diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,270034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 185.1851851851852, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.018518518518518517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009821385610848665, + "kl": 8.805232027953025e-06, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 268.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.037037037037037035, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.621858596801758, + "kl": 0.0004779777809744701, + "learning_rate": 3e-09, + "loss": 0.0384, + "num_tokens": 563.0, + "reward": 4.125, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.308422088623047, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.05555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.357028007507324, + "kl": 0.0008478831732645631, + "learning_rate": 6e-09, + "loss": -0.2878, + "num_tokens": 873.0, + "reward": 1.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.5, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.07407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.526264667510986, + "kl": 0.0006631783908233047, + "learning_rate": 9.000000000000001e-09, + "loss": -0.0004, + "num_tokens": 1114.0, + "reward": 2.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.09259259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6715641021728516, + "kl": 0.0006134712166385725, + "learning_rate": 1.2e-08, + "loss": -0.0587, + "num_tokens": 1486.0, + "reward": 0.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 0.375, + "rewards/reward_combined/std": 2.25, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.999401569366455, + "kl": 0.0002802525559673086, + "learning_rate": 1.5000000000000002e-08, + "loss": -0.0788, + "num_tokens": 1772.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.12962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5117316246032715, + "kl": 0.0005427531723398715, + "learning_rate": 1.8000000000000002e-08, + "loss": 0.066, + "num_tokens": 2110.0, + "reward": 0.875, + "reward_std": 2.0564937591552734, + "rewards/reward_combined/mean": 0.875, + "rewards/reward_combined/std": 2.0564937591552734, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.14814814814814814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002674127172213048, + "kl": 2.959370590360777e-06, + "learning_rate": 2.1e-08, + "loss": 0.0, + "num_tokens": 2330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.16666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.315208435058594, + "kl": 0.0001467584806960076, + "learning_rate": 2.4e-08, + "loss": 0.4813, + "num_tokens": 2613.0, + "reward": 2.25, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 2.5, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.18518518518518517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008726971223950386, + "kl": 9.009348764266178e-05, + "learning_rate": 2.7e-08, + "loss": 0.0, + "num_tokens": 2918.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.2037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.494390487670898, + "kl": 0.00012619226527021965, + "learning_rate": 3.0000000000000004e-08, + "loss": 0.1288, + "num_tokens": 3216.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.2222222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008680141530930996, + "kl": 0.00012682732995017432, + "learning_rate": 3.3e-08, + "loss": 0.0, + "num_tokens": 3451.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.005102040711790323, + "clip_ratio/low_min": 0.005102040711790323, + "clip_ratio/region_mean": 0.005102040711790323, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.24074074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5816242694854736, + "kl": 0.000613169715506956, + "learning_rate": 3.6000000000000005e-08, + "loss": -0.0895, + "num_tokens": 3841.0, + "reward": 3.0, + "reward_std": 5.582711219787598, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 5.582711696624756, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.25925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004939392674714327, + "kl": 0.00022975738829700276, + "learning_rate": 3.9e-08, + "loss": 0.0, + "num_tokens": 4135.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.2777777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.921114444732666, + "kl": 0.0004165449208812788, + "learning_rate": 4.2e-08, + "loss": 0.0782, + "num_tokens": 4394.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9938137531280518, + "kl": 0.00014287605335994158, + "learning_rate": 4.5e-08, + "loss": 0.363, + "num_tokens": 4747.0, + "reward": 7.5, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 1.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.3148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.363356828689575, + "kl": 9.002560909721069e-05, + "learning_rate": 4.8e-08, + "loss": -0.0352, + "num_tokens": 5169.0, + "reward": -1.4500000476837158, + "reward_std": 3.2827835083007812, + "rewards/reward_combined/mean": -1.4500000476837158, + "rewards/reward_combined/std": 3.2827835083007812, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.03240442276001, + "kl": 0.00012843777949456125, + "learning_rate": 5.100000000000001e-08, + "loss": 0.1394, + "num_tokens": 5446.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.35185185185185186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014763526618480682, + "kl": 0.000279415808563499, + "learning_rate": 5.4e-08, + "loss": 0.0, + "num_tokens": 5752.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.37037037037037035, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9281020164489746, + "kl": 1.8599247951556208e-05, + "learning_rate": 5.7e-08, + "loss": 0.0351, + "num_tokens": 6038.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.528041839599609, + "kl": 0.0005270973924780264, + "learning_rate": 6.000000000000001e-08, + "loss": 0.0281, + "num_tokens": 6295.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.4074074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004833583254367113, + "kl": 6.678203693155638e-05, + "learning_rate": 6.300000000000001e-08, + "loss": 0.0, + "num_tokens": 6559.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.42592592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013450962491333485, + "kl": 0.00042931419739034027, + "learning_rate": 6.6e-08, + "loss": 0.0, + "num_tokens": 6831.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0625, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.4444444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.01920747756958, + "kl": 0.0005032640910940245, + "learning_rate": 6.9e-08, + "loss": 0.178, + "num_tokens": 7091.0, + "reward": 2.25, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 2.5, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.46296296296296297, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3310747110372176e-06, + "kl": -3.725290298461914e-09, + "learning_rate": 7.200000000000001e-08, + "loss": -0.0, + "num_tokens": 7311.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.48148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.288567543029785, + "kl": 0.0004660324193537235, + "learning_rate": 7.500000000000001e-08, + "loss": 0.4202, + "num_tokens": 7870.0, + "reward": 2.174999952316284, + "reward_std": 1.649999976158142, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.649999976158142, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 77.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 77.0, + "completions/mean_terminated_length": 17.33333396911621, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.563615083694458, + "kl": 0.0005240105965640396, + "learning_rate": 7.8e-08, + "loss": 0.3654, + "num_tokens": 8394.0, + "reward": 3.049999952316284, + "reward_std": 4.439594745635986, + "rewards/reward_combined/mean": 3.049999952316284, + "rewards/reward_combined/std": 4.439594268798828, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.5185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.841129302978516, + "kl": 0.0007524208049289882, + "learning_rate": 8.1e-08, + "loss": 0.2033, + "num_tokens": 8650.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.5370370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04252630099654198, + "kl": 0.0013804823393002152, + "learning_rate": 8.4e-08, + "loss": 0.0001, + "num_tokens": 8872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.5555555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005104560405015945, + "kl": 5.049820174463093e-06, + "learning_rate": 8.700000000000001e-08, + "loss": 0.0, + "num_tokens": 9235.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.5740740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.057951927185059, + "kl": 0.00030255227466113865, + "learning_rate": 9e-08, + "loss": 0.1186, + "num_tokens": 9551.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.5925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.524298191070557, + "kl": 0.00030236503516789526, + "learning_rate": 9.3e-08, + "loss": 0.0886, + "num_tokens": 9823.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.6111111111111112, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.991766929626465, + "kl": 0.00025091033603530377, + "learning_rate": 9.6e-08, + "loss": 0.0282, + "num_tokens": 10088.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 79.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 79.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.6296296296296297, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125446319580078, + "kl": 0.00030589695961680263, + "learning_rate": 9.900000000000001e-08, + "loss": 0.4322, + "num_tokens": 10632.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.6481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04023784399032593, + "kl": 0.00029347091913223267, + "learning_rate": 1.0200000000000001e-07, + "loss": 0.0, + "num_tokens": 10842.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 89.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 89.25, + "completions/mean_terminated_length": 33.66666793823242, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.945043921470642, + "kl": 0.00041686483018565923, + "learning_rate": 1.0500000000000001e-07, + "loss": 0.4212, + "num_tokens": 11427.0, + "reward": 5.25, + "reward_std": 5.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 5.5, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.6851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07464797049760818, + "kl": 0.0011479780077934265, + "learning_rate": 1.08e-07, + "loss": 0.0001, + "num_tokens": 11639.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 47.75, + "completions/mean_terminated_length": 47.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.823786497116089, + "kl": 0.0004799226007889956, + "learning_rate": 1.11e-07, + "loss": -0.0151, + "num_tokens": 12054.0, + "reward": 1.2999999523162842, + "reward_std": 4.661187171936035, + "rewards/reward_combined/mean": 1.2999999523162842, + "rewards/reward_combined/std": 4.661187171936035, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7222222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01007902156561613, + "kl": 0.0002532374801376136, + "learning_rate": 1.14e-07, + "loss": 0.0, + "num_tokens": 12403.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.7407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023423200473189354, + "kl": 0.0002661251783138141, + "learning_rate": 1.17e-07, + "loss": 0.0, + "num_tokens": 12670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.757240295410156, + "kl": 0.00041234656964661554, + "learning_rate": 1.2000000000000002e-07, + "loss": -0.0342, + "num_tokens": 12989.0, + "reward": 0.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.625, + "rewards/reward_combined/std": 0.25, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.7777777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01539209857583046, + "kl": 0.00012184950537630357, + "learning_rate": 1.23e-07, + "loss": 0.0, + "num_tokens": 13285.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.7962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000477473484352231, + "kl": 1.5949416365401703e-06, + "learning_rate": 1.2600000000000002e-07, + "loss": 0.0, + "num_tokens": 13599.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.8148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.974885940551758, + "kl": 0.00047116080531850457, + "learning_rate": 1.29e-07, + "loss": -0.1904, + "num_tokens": 13906.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.8333333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007154799532145262, + "kl": 0.00013420979303191416, + "learning_rate": 1.32e-07, + "loss": 0.0, + "num_tokens": 14174.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.206045627593994, + "kl": 0.00031978863989934325, + "learning_rate": 1.35e-07, + "loss": -0.0354, + "num_tokens": 14464.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.8703703703703703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02522652968764305, + "kl": 0.00045043845602776855, + "learning_rate": 1.38e-07, + "loss": 0.0, + "num_tokens": 14740.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.8888888888888888, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.281634569168091, + "kl": 0.0002880931715480983, + "learning_rate": 1.41e-07, + "loss": 0.0011, + "num_tokens": 15060.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.9074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.099334716796875, + "kl": 0.0003625357348937541, + "learning_rate": 1.4400000000000002e-07, + "loss": 0.1533, + "num_tokens": 15375.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9259259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.123101234436035, + "kl": 0.0005585844337474555, + "learning_rate": 1.47e-07, + "loss": 0.2408, + "num_tokens": 15718.0, + "reward": 4.625, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 4.308422088623047, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.9444444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.16160774230957, + "kl": 0.0004420234326971695, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0612, + "num_tokens": 16041.0, + "reward": 4.125, + "reward_std": 3.1721444129943848, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.1721444129943848, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.9629629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6957550048828125, + "kl": 0.00021745844787801616, + "learning_rate": 1.53e-07, + "loss": 0.0535, + "num_tokens": 16337.0, + "reward": 5.25, + "reward_std": 5.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 5.5, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.15764045715332, + "kl": 0.00020359230984468013, + "learning_rate": 1.56e-07, + "loss": -0.0346, + "num_tokens": 16650.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2924599647521973, + "kl": 0.0005566470790654421, + "learning_rate": 1.59e-07, + "loss": -0.0093, + "num_tokens": 17038.0, + "reward": 2.0, + "reward_std": 4.6547465324401855, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 4.654747009277344, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.017241379246115685, + "clip_ratio/low_min": 0.017241379246115685, + "clip_ratio/region_mean": 0.017241379246115685, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 1.0185185185185186, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.021492004394531, + "kl": 0.001439574727555737, + "learning_rate": 1.62e-07, + "loss": -0.0788, + "num_tokens": 17314.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 1.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.524380922317505, + "kl": 0.00041615290683694184, + "learning_rate": 1.65e-07, + "loss": 0.2664, + "num_tokens": 17716.0, + "reward": 2.375, + "reward_std": 3.75, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 3.75, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 1.0555555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5236964225769043, + "kl": 3.65465457434766e-05, + "learning_rate": 1.68e-07, + "loss": 0.0292, + "num_tokens": 17978.0, + "reward": 3.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 1.25, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.133392333984375, + "kl": 0.0001687385083641857, + "learning_rate": 1.71e-07, + "loss": -0.0697, + "num_tokens": 18249.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 1.0925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004695798736065626, + "kl": 0.0001588340091984719, + "learning_rate": 1.7400000000000002e-07, + "loss": 0.0, + "num_tokens": 18594.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 1.1111111111111112, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.737401485443115, + "kl": 0.0004575830971589312, + "learning_rate": 1.7699999999999998e-07, + "loss": 0.0247, + "num_tokens": 18907.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 1.1296296296296295, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4742252826690674, + "kl": 5.735970080422703e-05, + "learning_rate": 1.8e-07, + "loss": 0.0259, + "num_tokens": 19177.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 1.1481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016168957808986306, + "kl": 1.7136335372924805e-05, + "learning_rate": 1.83e-07, + "loss": 0.0, + "num_tokens": 19390.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 1.1666666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007183755282312632, + "kl": 0.0001739251489993876, + "learning_rate": 1.86e-07, + "loss": 0.0, + "num_tokens": 19679.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 1.1851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9159224033355713, + "kl": 0.00013322310405783355, + "learning_rate": 1.89e-07, + "loss": -0.0022, + "num_tokens": 19955.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 1.2037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.185230255126953, + "kl": 0.0004092542003490962, + "learning_rate": 1.92e-07, + "loss": 0.15, + "num_tokens": 20302.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 1.2222222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3891133069992065, + "kl": 0.00010689443297451362, + "learning_rate": 1.95e-07, + "loss": -0.0209, + "num_tokens": 20668.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 1.2407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.615692138671875, + "kl": 0.0004022928769700229, + "learning_rate": 1.9800000000000003e-07, + "loss": 0.0945, + "num_tokens": 21006.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 1.2592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6572940349578857, + "kl": 0.0006055706035112962, + "learning_rate": 2.01e-07, + "loss": 0.245, + "num_tokens": 21329.0, + "reward": 3.875, + "reward_std": 2.688710927963257, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 2.688710927963257, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 1.2777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.529059886932373, + "kl": 0.0003714864724315703, + "learning_rate": 2.0400000000000003e-07, + "loss": -0.0404, + "num_tokens": 21603.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.2962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007392583065666258, + "kl": 1.1846422964367775e-05, + "learning_rate": 2.0700000000000001e-07, + "loss": 0.0, + "num_tokens": 21863.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 1.3148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.763455629348755, + "kl": 0.0002756074536591768, + "learning_rate": 2.1000000000000003e-07, + "loss": 0.156, + "num_tokens": 22145.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 1.3333333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006291932892054319, + "kl": 6.271153688430786e-05, + "learning_rate": 2.13e-07, + "loss": 0.0, + "num_tokens": 22355.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 1.3518518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013468775898218155, + "kl": 0.0001944929754245095, + "learning_rate": 2.16e-07, + "loss": 0.0, + "num_tokens": 22619.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.3703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007148392498493195, + "kl": 0.00010193139314651489, + "learning_rate": 2.19e-07, + "loss": 0.0, + "num_tokens": 22875.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 1.3888888888888888, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.728374719619751, + "kl": 0.00022550571520696394, + "learning_rate": 2.22e-07, + "loss": -0.0574, + "num_tokens": 23166.0, + "reward": 7.125, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 7.125, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 1.4074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.527256011962891, + "kl": 0.0014133312506601214, + "learning_rate": 2.25e-07, + "loss": 0.0118, + "num_tokens": 23472.0, + "reward": 1.75, + "reward_std": 4.804512023925781, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 4.804512023925781, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 1.425925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7154812812805176, + "kl": 0.00011746803647838533, + "learning_rate": 2.28e-07, + "loss": 0.0371, + "num_tokens": 23886.0, + "reward": -0.9500000476837158, + "reward_std": 2.245736837387085, + "rewards/reward_combined/mean": -0.9500000476837158, + "rewards/reward_combined/std": 2.245736837387085, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 1.4444444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028595319017767906, + "kl": 9.809278344619088e-05, + "learning_rate": 2.31e-07, + "loss": 0.0, + "num_tokens": 24205.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 1.462962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0056686620227992535, + "kl": 9.486254481316791e-05, + "learning_rate": 2.34e-07, + "loss": 0.0, + "num_tokens": 24512.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 1.4814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.017726421356201, + "kl": 0.00020752509590238333, + "learning_rate": 2.3700000000000002e-07, + "loss": 0.4469, + "num_tokens": 24827.0, + "reward": 1.5, + "reward_std": 2.4494898319244385, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 2.4494898319244385, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 1.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007125639356672764, + "kl": 9.506940477876924e-05, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.0, + "num_tokens": 25063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 1.5185185185185186, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8164546489715576, + "kl": 7.238993202918209e-05, + "learning_rate": 2.43e-07, + "loss": 0.0508, + "num_tokens": 25340.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 1.5370370370370372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045325103565119207, + "kl": 5.364418029785156e-07, + "learning_rate": 2.46e-07, + "loss": 0.0, + "num_tokens": 25552.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 1.5555555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.442258358001709, + "kl": 0.00023098269593901932, + "learning_rate": 2.49e-07, + "loss": 0.013, + "num_tokens": 25923.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 1.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8734862208366394, + "kl": 8.921697735786438e-05, + "learning_rate": 2.5200000000000003e-07, + "loss": -0.0021, + "num_tokens": 26235.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 1.5925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5689330101013184, + "kl": 0.0003830389105132781, + "learning_rate": 2.5500000000000005e-07, + "loss": 0.0298, + "num_tokens": 26516.0, + "reward": 5.5, + "reward_std": 5.0, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 5.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 1.6111111111111112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018779064994305372, + "kl": 5.648910428135423e-05, + "learning_rate": 2.58e-07, + "loss": 0.0, + "num_tokens": 26806.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 1.6296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028449570760130882, + "kl": 0.0003179311752319336, + "learning_rate": 2.6099999999999997e-07, + "loss": 0.0, + "num_tokens": 27022.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 1.6481481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8877177238464355, + "kl": 0.00024160796601790935, + "learning_rate": 2.64e-07, + "loss": 0.0515, + "num_tokens": 27329.0, + "reward": 5.875, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 2.462214469909668, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 1.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02491728961467743, + "kl": 0.0009081092721316963, + "learning_rate": 2.67e-07, + "loss": 0.0, + "num_tokens": 27685.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 90.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 90.75, + "completions/mean_terminated_length": 35.66666793823242, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 1.6851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6458799839019775, + "kl": 0.0007071812287904322, + "learning_rate": 2.7e-07, + "loss": 0.5181, + "num_tokens": 28264.0, + "reward": 1.75, + "reward_std": 2.362907886505127, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 2.362907886505127, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 1.7037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.708519458770752, + "kl": 0.0002975300085381605, + "learning_rate": 2.73e-07, + "loss": -0.0639, + "num_tokens": 28641.0, + "reward": 1.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.25, + "rewards/reward_combined/std": 1.5, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 1.7222222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016283635050058365, + "kl": 0.0003545835934346542, + "learning_rate": 2.76e-07, + "loss": 0.0, + "num_tokens": 28975.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 83.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 83.75, + "completions/mean_terminated_length": 26.33333396911621, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 1.7407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6538562774658203, + "kl": 0.00046103380736894906, + "learning_rate": 2.79e-07, + "loss": 0.4983, + "num_tokens": 29562.0, + "reward": 0.6749999523162842, + "reward_std": 4.972172737121582, + "rewards/reward_combined/mean": 0.6749999523162842, + "rewards/reward_combined/std": 4.972172737121582, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 89.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 89.25, + "completions/mean_terminated_length": 33.66666793823242, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 1.7592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.086613893508911, + "kl": 0.0005300322663970292, + "learning_rate": 2.82e-07, + "loss": -0.0731, + "num_tokens": 30147.0, + "reward": 1.5, + "reward_std": 1.3540064096450806, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 1.3540064096450806, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 1.7777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.068719863891602, + "kl": 0.00040509529645760267, + "learning_rate": 2.85e-07, + "loss": -0.2616, + "num_tokens": 30366.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 1.7962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.969927325262688e-06, + "kl": 0.0, + "learning_rate": 2.8800000000000004e-07, + "loss": 0.0, + "num_tokens": 30586.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 1.8148148148148149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03004569560289383, + "kl": 0.0005588829517364502, + "learning_rate": 2.91e-07, + "loss": 0.0, + "num_tokens": 30830.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 1.8333333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006689121946692467, + "kl": 0.00021418453980004415, + "learning_rate": 2.94e-07, + "loss": 0.0, + "num_tokens": 31121.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 1.8518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5398921966552734, + "kl": 0.00028973876032978296, + "learning_rate": 2.97e-07, + "loss": -0.0279, + "num_tokens": 31479.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 1.8703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008573658764362335, + "kl": 0.00012734341544273775, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "num_tokens": 31761.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 1.8888888888888888, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.150824069976807, + "kl": 0.0006834420200902969, + "learning_rate": 3.0300000000000005e-07, + "loss": 0.0776, + "num_tokens": 32063.0, + "reward": 5.5, + "reward_std": 5.0, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 5.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 1.9074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.890849590301514, + "kl": 0.00020996305920562008, + "learning_rate": 3.06e-07, + "loss": 0.1167, + "num_tokens": 32384.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 1.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.60103702545166, + "kl": 8.521063136868179e-05, + "learning_rate": 3.09e-07, + "loss": 0.1945, + "num_tokens": 32625.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 1.9444444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048797205090522766, + "kl": 0.0004254840314388275, + "learning_rate": 3.12e-07, + "loss": 0.0, + "num_tokens": 32879.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.9629629629629628, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.783179998397827, + "kl": 0.00042832360486499965, + "learning_rate": 3.15e-07, + "loss": 0.1537, + "num_tokens": 33160.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 1.9814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.766118049621582, + "kl": 0.0008387027846765704, + "learning_rate": 3.18e-07, + "loss": -0.0625, + "num_tokens": 33450.0, + "reward": 2.875, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 2.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.873804569244385, + "kl": 0.0007832607952877879, + "learning_rate": 3.21e-07, + "loss": 0.2217, + "num_tokens": 33828.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 2.0185185185185186, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.612246513366699, + "kl": 0.000577682916627964, + "learning_rate": 3.24e-07, + "loss": -0.0154, + "num_tokens": 34133.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 2.037037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01946515031158924, + "kl": 0.0006353051285259426, + "learning_rate": 3.27e-07, + "loss": 0.0, + "num_tokens": 34470.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 0.5, + "rewards/reward_combined/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 2.0555555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6852893829345703, + "kl": 0.0001253573518624762, + "learning_rate": 3.3e-07, + "loss": -0.0299, + "num_tokens": 34887.0, + "reward": 0.25, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 0.25, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 2.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2597243785858154, + "kl": 0.00020556408708216622, + "learning_rate": 3.3300000000000003e-07, + "loss": -0.0082, + "num_tokens": 35210.0, + "reward": 5.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 2.75, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 2.0925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017365755513310432, + "kl": 0.00045232250704430044, + "learning_rate": 3.36e-07, + "loss": 0.0, + "num_tokens": 35421.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 85.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 85.25, + "completions/mean_terminated_length": 28.33333396911621, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 2.111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1956639289855957, + "kl": 0.0004044400993734598, + "learning_rate": 3.39e-07, + "loss": 0.2955, + "num_tokens": 35978.0, + "reward": 0.675000011920929, + "reward_std": 2.599198579788208, + "rewards/reward_combined/mean": 0.675000011920929, + "rewards/reward_combined/std": 2.599198579788208, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.1296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4492805004119873, + "kl": 0.00033524764876347035, + "learning_rate": 3.42e-07, + "loss": 0.1065, + "num_tokens": 36255.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 2.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7756409645080566, + "kl": 0.00026651281223166734, + "learning_rate": 3.4500000000000003e-07, + "loss": 0.053, + "num_tokens": 36573.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007692307699471712, + "clip_ratio/low_min": 0.007692307699471712, + "clip_ratio/region_mean": 0.007692307699471712, + "completion_length": 49.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 49.25, + "completions/mean_terminated_length": 49.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 2.1666666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.275442361831665, + "kl": 0.0005813548341393471, + "learning_rate": 3.4800000000000005e-07, + "loss": -0.0847, + "num_tokens": 37022.0, + "reward": 4.125, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.190763473510742, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 149.75, + "completions/clipped_ratio": 0.5, + "completions/max_length": 256.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 149.75, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 2.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7361387014389038, + "kl": 0.0004337559803389013, + "learning_rate": 3.51e-07, + "loss": 0.1386, + "num_tokens": 37845.0, + "reward": -0.8250000476837158, + "reward_std": 5.578754425048828, + "rewards/reward_combined/mean": -0.8250000476837158, + "rewards/reward_combined/std": 5.57875394821167, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 2.2037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.416597366333008, + "kl": 0.00023660504666622728, + "learning_rate": 3.5399999999999997e-07, + "loss": 0.038, + "num_tokens": 38140.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.2222222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010696391575038433, + "kl": 0.00010671019845176488, + "learning_rate": 3.57e-07, + "loss": 0.0, + "num_tokens": 38400.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 2.240740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6675543785095215, + "kl": 0.00046405295142903924, + "learning_rate": 3.6e-07, + "loss": 0.0599, + "num_tokens": 38732.0, + "reward": 3.25, + "reward_std": 3.0686588287353516, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 3.0686588287353516, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 2.259259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07305122166872025, + "kl": 0.0010722950100898743, + "learning_rate": 3.63e-07, + "loss": 0.0001, + "num_tokens": 38944.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 2.2777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.21618938446045, + "kl": 0.0008814340108074248, + "learning_rate": 3.66e-07, + "loss": 0.2721, + "num_tokens": 39170.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 2.2962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.745265007019043, + "kl": 0.00030546652851626277, + "learning_rate": 3.69e-07, + "loss": 0.0257, + "num_tokens": 39530.0, + "reward": 2.0, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 2.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3485212326049805, + "kl": 0.0005984032613923773, + "learning_rate": 3.72e-07, + "loss": 0.0998, + "num_tokens": 39828.0, + "reward": 1.75, + "reward_std": 4.27200174331665, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 4.27200174331665, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 2.3333333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9052869081497192, + "kl": 0.00010325404946343042, + "learning_rate": 3.75e-07, + "loss": 0.4518, + "num_tokens": 40355.0, + "reward": 5.800000190734863, + "reward_std": 4.400000095367432, + "rewards/reward_combined/mean": 5.800000190734863, + "rewards/reward_combined/std": 4.400000095367432, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 2.351851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00480230525135994, + "kl": 5.0453531002858654e-05, + "learning_rate": 3.78e-07, + "loss": 0.0, + "num_tokens": 40681.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 90.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 90.5, + "completions/mean_terminated_length": 90.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 2.3703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8034842014312744, + "kl": 0.0003667814744403586, + "learning_rate": 3.8100000000000004e-07, + "loss": 0.3378, + "num_tokens": 41263.0, + "reward": 3.299999952316284, + "reward_std": 4.982635974884033, + "rewards/reward_combined/mean": 3.299999952316284, + "rewards/reward_combined/std": 4.982636451721191, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 2.388888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015238815685734153, + "kl": 2.2547319531440735e-05, + "learning_rate": 3.84e-07, + "loss": 0.0, + "num_tokens": 41507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 2.4074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.311396598815918, + "kl": 0.0004804107011295855, + "learning_rate": 3.87e-07, + "loss": 0.0226, + "num_tokens": 41839.0, + "reward": 3.75, + "reward_std": 3.1224989891052246, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 3.1224989891052246, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007267743814736605, + "kl": 0.00011053532580262981, + "learning_rate": 3.9e-07, + "loss": 0.0, + "num_tokens": 42109.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 2.4444444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01588411070406437, + "kl": 0.0002482764102751389, + "learning_rate": 3.9300000000000004e-07, + "loss": 0.0, + "num_tokens": 42373.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 2.462962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453706741333008, + "kl": 0.0002344980457564816, + "learning_rate": 3.9600000000000005e-07, + "loss": 0.0328, + "num_tokens": 42665.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0035197327379137278, + "clip_ratio/low_min": 0.0035197327379137278, + "clip_ratio/region_mean": 0.0035197327379137278, + "completion_length": 142.25, + "completions/clipped_ratio": 0.5, + "completions/max_length": 256.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 2.4814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.361878514289856, + "kl": 0.0006233485473785549, + "learning_rate": 3.99e-07, + "loss": 0.6507, + "num_tokens": 43458.0, + "reward": 1.6749999523162842, + "reward_std": 4.897873878479004, + "rewards/reward_combined/mean": 1.6749999523162842, + "rewards/reward_combined/std": 4.897873878479004, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 2.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012103725224733353, + "kl": 0.0004016427083115559, + "learning_rate": 4.02e-07, + "loss": 0.0, + "num_tokens": 43736.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 2.5185185185185186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021656809374690056, + "kl": 0.00031200129160424694, + "learning_rate": 4.0500000000000004e-07, + "loss": 0.0, + "num_tokens": 44001.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 2.537037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.230462551116943, + "kl": 0.0003037539281649515, + "learning_rate": 4.0800000000000005e-07, + "loss": 0.2245, + "num_tokens": 44278.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 2.5555555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006961924955248833, + "kl": 6.836801912868395e-05, + "learning_rate": 4.1100000000000007e-07, + "loss": 0.0, + "num_tokens": 44515.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 2.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.957086563110352, + "kl": 0.00036130990338278934, + "learning_rate": 4.1400000000000003e-07, + "loss": 0.1463, + "num_tokens": 44792.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 2.5925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.180386844789609e-05, + "kl": 4.3958425521850586e-07, + "learning_rate": 4.1700000000000004e-07, + "loss": 0.0, + "num_tokens": 45004.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 2.611111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019667502492666245, + "kl": 0.00045022181802778505, + "learning_rate": 4.2000000000000006e-07, + "loss": 0.0, + "num_tokens": 45294.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 2.6296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002667341032065451, + "kl": 4.52027666142385e-06, + "learning_rate": 4.2299999999999996e-07, + "loss": 0.0, + "num_tokens": 45571.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 2.648148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.808579444885254, + "kl": 0.0011022969265468419, + "learning_rate": 4.26e-07, + "loss": 0.1467, + "num_tokens": 45917.0, + "reward": 2.25, + "reward_std": 2.020725965499878, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 2.020725965499878, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.6666666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.796252250671387, + "kl": 5.559250591602449e-05, + "learning_rate": 4.29e-07, + "loss": 0.0009, + "num_tokens": 46183.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 144 + }, + { + "clip_ratio/high_max": 0.009999999776482582, + "clip_ratio/high_mean": 0.009999999776482582, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009999999776482582, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 2.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.496193885803223, + "kl": 0.0005811657465528697, + "learning_rate": 4.32e-07, + "loss": -0.0265, + "num_tokens": 46515.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 2.7037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0275633335113525, + "kl": 0.0004115433621336706, + "learning_rate": 4.3499999999999996e-07, + "loss": -0.028, + "num_tokens": 46828.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.7222222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0039161560125648975, + "kl": 5.484670327859931e-05, + "learning_rate": 4.38e-07, + "loss": 0.0, + "num_tokens": 47084.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 2.7407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002509040350560099, + "kl": 3.664294965233239e-06, + "learning_rate": 4.41e-07, + "loss": 0.0, + "num_tokens": 47449.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 2.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.516165256500244, + "kl": 0.0008194116380764171, + "learning_rate": 4.44e-07, + "loss": -0.1382, + "num_tokens": 47737.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 2.7777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.077332973480225, + "kl": 0.0009726728312671185, + "learning_rate": 4.4699999999999997e-07, + "loss": 0.1408, + "num_tokens": 48024.0, + "reward": 1.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.5, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 2.7962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.980436325073242, + "kl": 0.0003069709346164018, + "learning_rate": 4.5e-07, + "loss": -0.0582, + "num_tokens": 48319.0, + "reward": 7.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 7.25, + "rewards/reward_combined/std": 1.5, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 2.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.034146308898926, + "kl": 0.0006649124989053234, + "learning_rate": 4.53e-07, + "loss": 0.0705, + "num_tokens": 48647.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 2.8333333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.877814531326294, + "kl": 0.0003990626319136936, + "learning_rate": 4.56e-07, + "loss": 0.0147, + "num_tokens": 48991.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 2.851851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5918796062469482, + "kl": 0.00011902215555892326, + "learning_rate": 4.59e-07, + "loss": 0.0547, + "num_tokens": 49290.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 2.8703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005021293181926012, + "kl": 8.40714678815857e-05, + "learning_rate": 4.62e-07, + "loss": 0.0, + "num_tokens": 49598.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 2.888888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039240170270204544, + "kl": 0.0008037164807319641, + "learning_rate": 4.65e-07, + "loss": 0.0, + "num_tokens": 49816.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 2.9074074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026107507292181253, + "kl": 0.00021249623387120664, + "learning_rate": 4.68e-07, + "loss": 0.0, + "num_tokens": 50050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 2.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006980568869039416, + "kl": 3.6558136343955994e-05, + "learning_rate": 4.71e-07, + "loss": 0.0, + "num_tokens": 50310.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 2.9444444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000600357714574784, + "kl": 1.3470649946611957e-05, + "learning_rate": 4.7400000000000004e-07, + "loss": 0.0, + "num_tokens": 50530.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 2.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.563671350479126, + "kl": 0.0005213647673372179, + "learning_rate": 4.77e-07, + "loss": 0.0744, + "num_tokens": 50865.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 75.25, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 2.9814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.941039800643921, + "kl": 0.00040402429658570327, + "learning_rate": 4.800000000000001e-07, + "loss": 0.4818, + "num_tokens": 51382.0, + "reward": 2.549999952316284, + "reward_std": 3.652852773666382, + "rewards/reward_combined/mean": 2.549999952316284, + "rewards/reward_combined/std": 3.6528525352478027, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 3.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008686037617735565, + "kl": 1.1232991596443753e-05, + "learning_rate": 4.830000000000001e-07, + "loss": 0.0, + "num_tokens": 51694.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 3.0185185185185186, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.922219276428223, + "kl": 0.0002770378050627187, + "learning_rate": 4.86e-07, + "loss": 0.1201, + "num_tokens": 51959.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 3.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.320078372955322, + "kl": 0.0008652110700495541, + "learning_rate": 4.89e-07, + "loss": 0.0749, + "num_tokens": 52318.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 164 + }, + { + "clip_ratio/high_max": 0.001805054140277207, + "clip_ratio/high_mean": 0.001805054140277207, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001805054140277207, + "completion_length": 80.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 80.75, + "completions/mean_terminated_length": 22.33333396911621, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 3.0555555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.327766180038452, + "kl": 0.0004794775159098208, + "learning_rate": 4.92e-07, + "loss": 0.4238, + "num_tokens": 52861.0, + "reward": 7.300000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.300000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 71.0, + "completions/mean_terminated_length": 9.333333969116211, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 3.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9788551330566406, + "kl": 0.00014810793072683737, + "learning_rate": 4.95e-07, + "loss": 0.425, + "num_tokens": 53369.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 3.0925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.228071212768555, + "kl": 0.00040546040690969676, + "learning_rate": 4.98e-07, + "loss": 0.0601, + "num_tokens": 53701.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 3.111111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003313072375021875, + "kl": 7.852911949157715e-06, + "learning_rate": 5.01e-07, + "loss": 0.0, + "num_tokens": 53921.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 3.1296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.729984283447266, + "kl": 0.00021809947793371975, + "learning_rate": 5.040000000000001e-07, + "loss": -0.0033, + "num_tokens": 54212.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 3.148148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005163852591067553, + "kl": 8.982420331449248e-05, + "learning_rate": 5.070000000000001e-07, + "loss": 0.0, + "num_tokens": 54472.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 3.1666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001008315128274262, + "kl": 7.666647434234619e-06, + "learning_rate": 5.100000000000001e-07, + "loss": 0.0, + "num_tokens": 54708.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 3.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017396019771695137, + "kl": 0.0002985633864227566, + "learning_rate": 5.13e-07, + "loss": 0.0, + "num_tokens": 54996.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 3.2037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3503596782684326, + "kl": 0.0009463485621381551, + "learning_rate": 5.16e-07, + "loss": -0.0942, + "num_tokens": 55333.0, + "reward": 0.42500001192092896, + "reward_std": 0.15000000596046448, + "rewards/reward_combined/mean": 0.42500001192092896, + "rewards/reward_combined/std": 0.14999999105930328, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 3.2222222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0455613136291504, + "kl": 0.00022622020333074033, + "learning_rate": 5.189999999999999e-07, + "loss": 0.0367, + "num_tokens": 55621.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 3.240740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021082421764731407, + "kl": 0.00023952528135851026, + "learning_rate": 5.219999999999999e-07, + "loss": 0.0, + "num_tokens": 55872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 3.259259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6618916988372803, + "kl": 3.9831074900575913e-05, + "learning_rate": 5.25e-07, + "loss": 0.0027, + "num_tokens": 56198.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 3.2777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9709019660949707, + "kl": 0.00026796314341481775, + "learning_rate": 5.28e-07, + "loss": 0.0813, + "num_tokens": 56473.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 3.2962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1422834396362305, + "kl": 0.00045865429274272174, + "learning_rate": 5.31e-07, + "loss": -0.1223, + "num_tokens": 56796.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 47.25, + "completions/mean_terminated_length": 47.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 3.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.48903226852417, + "kl": 0.0006679217622149736, + "learning_rate": 5.34e-07, + "loss": 0.111, + "num_tokens": 57209.0, + "reward": -1.25, + "reward_std": 2.0615527629852295, + "rewards/reward_combined/mean": -1.25, + "rewards/reward_combined/std": 2.0615527629852295, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.013888888992369175, + "clip_ratio/low_min": 0.013888888992369175, + "clip_ratio/region_mean": 0.013888888992369175, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 3.3333333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.716423988342285, + "kl": 0.0008695235010236502, + "learning_rate": 5.37e-07, + "loss": -0.0669, + "num_tokens": 57518.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 3.351851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7347817420959473, + "kl": 0.00016060109555837698, + "learning_rate": 5.4e-07, + "loss": -0.0739, + "num_tokens": 57794.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 3.3703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017193271778523922, + "kl": 3.500526327115949e-05, + "learning_rate": 5.43e-07, + "loss": 0.0, + "num_tokens": 58043.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 3.388888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.462109327316284, + "kl": 0.0006443507445510477, + "learning_rate": 5.46e-07, + "loss": 0.2934, + "num_tokens": 58431.0, + "reward": 1.25, + "reward_std": 4.573474407196045, + "rewards/reward_combined/mean": 1.25, + "rewards/reward_combined/std": 4.573474407196045, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 3.4074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9230544567108154, + "kl": 5.265841173240915e-05, + "learning_rate": 5.49e-07, + "loss": 0.0355, + "num_tokens": 58721.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 3.425925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5152268409729, + "kl": 0.0005125022289576009, + "learning_rate": 5.52e-07, + "loss": 0.1367, + "num_tokens": 59055.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 3.4444444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8972930908203125, + "kl": 0.0002818711072904989, + "learning_rate": 5.55e-07, + "loss": 0.0262, + "num_tokens": 59353.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 84.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 84.0, + "completions/mean_terminated_length": 26.666667938232422, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 3.462962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7512755393981934, + "kl": 0.0004527562850853428, + "learning_rate": 5.58e-07, + "loss": 0.2744, + "num_tokens": 59905.0, + "reward": 3.5, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 3.674234628677368, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 3.4814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.72933292388916, + "kl": 0.0010080479551106691, + "learning_rate": 5.61e-07, + "loss": 0.1277, + "num_tokens": 60182.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 3.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7610340118408203, + "kl": 0.0001639571419218555, + "learning_rate": 5.64e-07, + "loss": 0.0005, + "num_tokens": 60453.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 3.5185185185185186, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.874295711517334, + "kl": 0.0006885581533424556, + "learning_rate": 5.67e-07, + "loss": 0.0965, + "num_tokens": 60785.0, + "reward": 2.375, + "reward_std": 3.8810436725616455, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 3.8810436725616455, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 3.537037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004016809107270092, + "kl": 3.1813151508686133e-06, + "learning_rate": 5.7e-07, + "loss": 0.0, + "num_tokens": 61096.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 3.5555555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.614760398864746, + "kl": 0.00044257061745156534, + "learning_rate": 5.730000000000001e-07, + "loss": 0.1746, + "num_tokens": 61399.0, + "reward": 5.25, + "reward_std": 5.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 5.5, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 3.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004102109931409359, + "kl": 2.244114875793457e-05, + "learning_rate": 5.760000000000001e-07, + "loss": 0.0, + "num_tokens": 61607.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 3.5925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020067833829671144, + "kl": 7.15915666660294e-05, + "learning_rate": 5.79e-07, + "loss": 0.0, + "num_tokens": 61895.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 82.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 3.611111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.740919351577759, + "kl": 0.0005280704936012626, + "learning_rate": 5.82e-07, + "loss": 0.1512, + "num_tokens": 62451.0, + "reward": 1.7999999523162842, + "reward_std": 4.725110054016113, + "rewards/reward_combined/mean": 1.7999999523162842, + "rewards/reward_combined/std": 4.725110054016113, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 3.6296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.941718578338623, + "kl": 0.0002683470520423725, + "learning_rate": 5.85e-07, + "loss": 0.0117, + "num_tokens": 62858.0, + "reward": 0.925000011920929, + "reward_std": 1.4338176250457764, + "rewards/reward_combined/mean": 0.925000011920929, + "rewards/reward_combined/std": 1.433817744255066, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 3.648148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010845618322491646, + "kl": 0.00014817271221545525, + "learning_rate": 5.88e-07, + "loss": 0.0, + "num_tokens": 63124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 3.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010536052286624908, + "kl": 0.00016219168901443481, + "learning_rate": 5.91e-07, + "loss": 0.0, + "num_tokens": 63336.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 3.685185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.782825559843332e-05, + "kl": 5.132135243002267e-06, + "learning_rate": 5.94e-07, + "loss": 0.0, + "num_tokens": 63644.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 3.7037037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017704206402413547, + "kl": 4.967053683913036e-07, + "learning_rate": 5.970000000000001e-07, + "loss": 0.0, + "num_tokens": 63860.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 3.7222222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.628633077023551e-05, + "kl": 3.933285597668146e-06, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "num_tokens": 64224.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 3.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8029863834381104, + "kl": 0.0006761455442756414, + "learning_rate": 6.030000000000001e-07, + "loss": 0.0419, + "num_tokens": 64550.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 68.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 68.5, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 3.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2645158767700195, + "kl": 0.00014356183964991942, + "learning_rate": 6.060000000000001e-07, + "loss": 0.4782, + "num_tokens": 65032.0, + "reward": 2.924999952316284, + "reward_std": 2.1500000953674316, + "rewards/reward_combined/mean": 2.924999952316284, + "rewards/reward_combined/std": 2.1500000953674316, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 3.7777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.375810146331787, + "kl": 0.0004472557484405115, + "learning_rate": 6.09e-07, + "loss": 0.0486, + "num_tokens": 65339.0, + "reward": 1.125, + "reward_std": 1.8427786827087402, + "rewards/reward_combined/mean": 1.125, + "rewards/reward_combined/std": 1.8427786827087402, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 3.7962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016529974527657032, + "kl": 9.95248547042138e-06, + "learning_rate": 6.12e-07, + "loss": 0.0, + "num_tokens": 65558.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 3.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2477164268493652, + "kl": 0.00011237839862587862, + "learning_rate": 6.149999999999999e-07, + "loss": 0.0127, + "num_tokens": 65878.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 3.8333333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.262871265411377, + "kl": 0.00013070139539195225, + "learning_rate": 6.18e-07, + "loss": 0.0289, + "num_tokens": 66192.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 3.851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010668719187378883, + "kl": 0.0002653169158293167, + "learning_rate": 6.21e-07, + "loss": 0.0, + "num_tokens": 66463.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 3.8703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6986334323883057, + "kl": 2.062311068584677e-05, + "learning_rate": 6.24e-07, + "loss": 0.0016, + "num_tokens": 66740.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 3.888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.502748966217041, + "kl": 0.0008518050017300993, + "learning_rate": 6.27e-07, + "loss": 0.0399, + "num_tokens": 67051.0, + "reward": 3.0, + "reward_std": 3.316624879837036, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.316624879837036, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 3.9074074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010843516327440739, + "kl": 0.000132754968944937, + "learning_rate": 6.3e-07, + "loss": 0.0, + "num_tokens": 67351.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 3.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037427424103952944, + "kl": 1.382132359140087e-05, + "learning_rate": 6.33e-07, + "loss": 0.0, + "num_tokens": 67613.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 3.9444444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008297150954604149, + "kl": 0.00022291956702247262, + "learning_rate": 6.36e-07, + "loss": 0.0, + "num_tokens": 67886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 3.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.821705341339111, + "kl": 0.0011893765986314975, + "learning_rate": 6.39e-07, + "loss": 0.0412, + "num_tokens": 68144.0, + "reward": 2.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 2.0, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 3.9814814814814814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014322387985885143, + "kl": 0.0002677934747055133, + "learning_rate": 6.42e-07, + "loss": 0.0, + "num_tokens": 68442.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 98.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 98.25, + "completions/mean_terminated_length": 45.66666793823242, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 4.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.120150089263916, + "kl": 0.00032934667251538485, + "learning_rate": 6.45e-07, + "loss": 0.0681, + "num_tokens": 69071.0, + "reward": 1.2999999523162842, + "reward_std": 1.9646884202957153, + "rewards/reward_combined/mean": 1.2999999523162842, + "rewards/reward_combined/std": 1.9646883010864258, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 4.018518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.7130351066589355, + "kl": 0.00020774344011442736, + "learning_rate": 6.48e-07, + "loss": 0.2083, + "num_tokens": 69388.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 4.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.293214797973633, + "kl": 0.00045467575546354055, + "learning_rate": 6.51e-07, + "loss": 0.1662, + "num_tokens": 69699.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 4.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022832085378468037, + "kl": 2.7919808985643613e-05, + "learning_rate": 6.54e-07, + "loss": 0.0, + "num_tokens": 69915.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.135390281677246, + "kl": 0.0006713748298352584, + "learning_rate": 6.57e-07, + "loss": -0.0096, + "num_tokens": 70218.0, + "reward": 5.875, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 2.462214469909668, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 4.092592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008521368145011365, + "kl": 1.3694167137145996e-05, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 70462.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 4.111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.929198265075684, + "kl": 0.0007695133681409061, + "learning_rate": 6.63e-07, + "loss": 0.0979, + "num_tokens": 70700.0, + "reward": 2.5, + "reward_std": 3.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.0, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 4.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013419704046100378, + "kl": 2.1379492409323575e-05, + "learning_rate": 6.660000000000001e-07, + "loss": 0.0, + "num_tokens": 71020.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 4.148148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02378317527472973, + "kl": 0.00020574219524860382, + "learning_rate": 6.690000000000001e-07, + "loss": 0.0, + "num_tokens": 71280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 4.166666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6256306171417236, + "kl": 0.0004987402644474059, + "learning_rate": 6.72e-07, + "loss": -0.0001, + "num_tokens": 71611.0, + "reward": 1.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 4.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030701482319273055, + "kl": 9.885265171760693e-06, + "learning_rate": 6.75e-07, + "loss": 0.0, + "num_tokens": 71923.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.203703703703703, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.001032829284668, + "kl": 0.0008387054549530149, + "learning_rate": 6.78e-07, + "loss": 0.3488, + "num_tokens": 72240.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.222222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3922600746154785, + "kl": 0.0004214024083921686, + "learning_rate": 6.81e-07, + "loss": 0.1584, + "num_tokens": 72553.0, + "reward": 2.5, + "reward_std": 3.34165620803833, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.34165620803833, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.2407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5442514419555664, + "kl": 0.0010614169004838914, + "learning_rate": 6.84e-07, + "loss": -0.0212, + "num_tokens": 72900.0, + "reward": 1.875, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 1.875, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 4.2592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6496753692626953, + "kl": 0.0005403376708272845, + "learning_rate": 6.87e-07, + "loss": 0.1478, + "num_tokens": 73236.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 4.277777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4552295207977295, + "kl": 0.00028668949380517006, + "learning_rate": 6.900000000000001e-07, + "loss": -0.2265, + "num_tokens": 73604.0, + "reward": 4.125, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 2.25, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 4.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.199924945831299, + "kl": 0.00014511148037854582, + "learning_rate": 6.930000000000001e-07, + "loss": 0.0255, + "num_tokens": 73865.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.181790828704834, + "kl": 0.0004374007985461503, + "learning_rate": 6.960000000000001e-07, + "loss": 0.0313, + "num_tokens": 74169.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.676959276199341, + "kl": 0.001127946306951344, + "learning_rate": 6.990000000000001e-07, + "loss": 0.0259, + "num_tokens": 74450.0, + "reward": 5.25, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.351851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.054653167724609, + "kl": 0.00033280889329034835, + "learning_rate": 7.02e-07, + "loss": 0.2928, + "num_tokens": 74755.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 4.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008404637686908245, + "kl": 0.00020041676543769427, + "learning_rate": 7.05e-07, + "loss": 0.0, + "num_tokens": 75119.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.388888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.890386581420898, + "kl": 0.0002970975074276794, + "learning_rate": 7.079999999999999e-07, + "loss": 0.178, + "num_tokens": 75390.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 4.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.346397327026352e-05, + "kl": 1.4379620552062988e-06, + "learning_rate": 7.11e-07, + "loss": 0.0, + "num_tokens": 75610.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011283610947430134, + "kl": 2.702176516322652e-05, + "learning_rate": 7.14e-07, + "loss": 0.0, + "num_tokens": 75870.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01615816354751587, + "kl": 0.0001490861177444458, + "learning_rate": 7.17e-07, + "loss": 0.0, + "num_tokens": 76126.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 4.462962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8408299684524536, + "kl": 0.00019194966807845049, + "learning_rate": 7.2e-07, + "loss": -0.068, + "num_tokens": 76531.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 4.481481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039353568106889725, + "kl": 0.0008344904235855211, + "learning_rate": 7.23e-07, + "loss": 0.0, + "num_tokens": 76799.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 88.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 32.333335876464844, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 4.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2978901863098145, + "kl": 0.0006689954025205225, + "learning_rate": 7.26e-07, + "loss": 0.4072, + "num_tokens": 77404.0, + "reward": 0.42500001192092896, + "reward_std": 0.5377421379089355, + "rewards/reward_combined/mean": 0.42500001192092896, + "rewards/reward_combined/std": 0.5377421975135803, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 4.518518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2825461626052856, + "kl": 9.950984167517163e-05, + "learning_rate": 7.29e-07, + "loss": -0.0702, + "num_tokens": 77827.0, + "reward": 0.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 0.75, + "rewards/reward_combined/std": 1.5, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 4.537037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018169282702729106, + "kl": 2.6743327453004895e-05, + "learning_rate": 7.32e-07, + "loss": 0.0, + "num_tokens": 78061.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 4.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9174177646636963, + "kl": 0.0008164856844814494, + "learning_rate": 7.350000000000001e-07, + "loss": 0.0358, + "num_tokens": 78343.0, + "reward": 7.0, + "reward_std": 2.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 2.0, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 4.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0350708961486816, + "kl": 0.0002487677338649519, + "learning_rate": 7.38e-07, + "loss": 0.1144, + "num_tokens": 78694.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.315531015396118, + "kl": 0.00017569374904269353, + "learning_rate": 7.41e-07, + "loss": -0.013, + "num_tokens": 79002.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 4.611111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.840476036071777, + "kl": 0.00015407590763061307, + "learning_rate": 7.44e-07, + "loss": 0.1457, + "num_tokens": 79279.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 4.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007033797446638346, + "kl": 0.00014726072549819946, + "learning_rate": 7.47e-07, + "loss": 0.0, + "num_tokens": 79495.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 71.75, + "completions/mean_terminated_length": 10.333333969116211, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.648148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.511259078979492, + "kl": 0.0008783047087490559, + "learning_rate": 7.5e-07, + "loss": 0.4654, + "num_tokens": 80010.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.666666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017466451972723007, + "kl": 0.0004449118932825513, + "learning_rate": 7.53e-07, + "loss": 0.0, + "num_tokens": 80278.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 4.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.481826305389404, + "kl": 0.00036090027424506843, + "learning_rate": 7.56e-07, + "loss": -0.0047, + "num_tokens": 80566.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 4.703703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010092331795021892, + "kl": 2.580881118774414e-05, + "learning_rate": 7.590000000000001e-07, + "loss": 0.0, + "num_tokens": 80774.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 4.722222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.845901966094971, + "kl": 0.000812845813925378, + "learning_rate": 7.620000000000001e-07, + "loss": 0.0011, + "num_tokens": 81049.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 255 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.011363636702299118, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 4.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7894251346588135, + "kl": 0.00016434883582405746, + "learning_rate": 7.65e-07, + "loss": 0.029, + "num_tokens": 81388.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 4.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.973078727722168, + "kl": 0.00030741008231416345, + "learning_rate": 7.68e-07, + "loss": 0.0279, + "num_tokens": 81684.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.777777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000896158569958061, + "kl": 1.6787877029855736e-05, + "learning_rate": 7.71e-07, + "loss": 0.0, + "num_tokens": 81968.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 4.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024601693730801344, + "kl": 6.632786244153976e-05, + "learning_rate": 7.74e-07, + "loss": 0.0, + "num_tokens": 82280.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 4.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023117998614907265, + "kl": 0.0003232210933674651, + "learning_rate": 7.77e-07, + "loss": 0.0, + "num_tokens": 82499.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 4.833333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.515130043029785, + "kl": 0.00047652318608015776, + "learning_rate": 7.8e-07, + "loss": -0.0411, + "num_tokens": 82790.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 4.851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011952731758356094, + "kl": 0.00011453933620941825, + "learning_rate": 7.830000000000001e-07, + "loss": 0.0, + "num_tokens": 83058.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 4.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.928591728210449, + "kl": 0.0006977445736993104, + "learning_rate": 7.860000000000001e-07, + "loss": -0.0346, + "num_tokens": 83348.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 4.888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.726197242736816, + "kl": 0.000552371478988789, + "learning_rate": 7.890000000000001e-07, + "loss": 0.0243, + "num_tokens": 83707.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 4.907407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03761586919426918, + "kl": 0.0003040581941604614, + "learning_rate": 7.920000000000001e-07, + "loss": 0.0, + "num_tokens": 83919.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 4.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.180360794067383, + "kl": 0.0006064912013243884, + "learning_rate": 7.95e-07, + "loss": -0.0948, + "num_tokens": 84215.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 4.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.671012878417969, + "kl": 0.0007608090818393975, + "learning_rate": 7.98e-07, + "loss": -0.0114, + "num_tokens": 84547.0, + "reward": 1.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.5, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 4.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.696273326873779, + "kl": 0.0005270361725706607, + "learning_rate": 8.01e-07, + "loss": 0.1728, + "num_tokens": 84908.0, + "reward": 0.42500001192092896, + "reward_std": 0.14999999105930328, + "rewards/reward_combined/mean": 0.42500001192092896, + "rewards/reward_combined/std": 0.14999999105930328, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 4.981481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0282697677612305, + "kl": 0.0012080430533387698, + "learning_rate": 8.04e-07, + "loss": 0.0341, + "num_tokens": 85193.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 5.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011410972103476524, + "kl": 0.0003721677167050075, + "learning_rate": 8.070000000000001e-07, + "loss": 0.0, + "num_tokens": 85519.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.018518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.44157600402832, + "kl": 0.00143510103225708, + "learning_rate": 8.100000000000001e-07, + "loss": -0.1716, + "num_tokens": 85759.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 271 + }, + { + "clip_ratio/high_max": 0.007462686393409967, + "clip_ratio/high_mean": 0.007462686393409967, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007462686393409967, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 5.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5781354904174805, + "kl": 0.0007736670668236911, + "learning_rate": 8.130000000000001e-07, + "loss": 0.1346, + "num_tokens": 86096.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.370683670043945, + "kl": 0.0007411043479805812, + "learning_rate": 8.160000000000001e-07, + "loss": -0.0394, + "num_tokens": 86372.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 5.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6596527099609375, + "kl": 0.0003138103329547448, + "learning_rate": 8.190000000000001e-07, + "loss": -0.0019, + "num_tokens": 86658.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 5.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.514913558959961, + "kl": 0.0005813508469145745, + "learning_rate": 8.220000000000001e-07, + "loss": 0.0046, + "num_tokens": 86923.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.111111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009333821944892406, + "kl": 3.992146775999572e-05, + "learning_rate": 8.25e-07, + "loss": 0.0, + "num_tokens": 87200.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 5.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005320832133293152, + "kl": 7.409416139125824e-05, + "learning_rate": 8.280000000000001e-07, + "loss": 0.0, + "num_tokens": 87460.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.011363636702299118, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 5.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.405511856079102, + "kl": 0.0012021985021419823, + "learning_rate": 8.310000000000001e-07, + "loss": 0.0265, + "num_tokens": 87799.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 5.166666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.089540481567383, + "kl": 0.0009957485017366707, + "learning_rate": 8.340000000000001e-07, + "loss": -0.0519, + "num_tokens": 88057.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 5.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005388214252889156, + "kl": 0.00019227433949708939, + "learning_rate": 8.370000000000001e-07, + "loss": 0.0, + "num_tokens": 88377.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 5.203703703703703, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.229790210723877, + "kl": 0.0011238459264859557, + "learning_rate": 8.400000000000001e-07, + "loss": 0.0275, + "num_tokens": 88691.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.222222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.031806945800781, + "kl": 0.0004676797325373627, + "learning_rate": 8.430000000000001e-07, + "loss": 0.1669, + "num_tokens": 88970.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 5.2407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.256507873535156, + "kl": 0.0004470757266972214, + "learning_rate": 8.459999999999999e-07, + "loss": 0.2016, + "num_tokens": 89307.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 5.2592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04802239313721657, + "kl": 0.0007820095343049616, + "learning_rate": 8.489999999999999e-07, + "loss": 0.0, + "num_tokens": 89579.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 5.277777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3241055011749268, + "kl": 0.0007035969611024484, + "learning_rate": 8.52e-07, + "loss": -0.0473, + "num_tokens": 89933.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 5.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4867427349090576, + "kl": 0.0001704170208540745, + "learning_rate": 8.55e-07, + "loss": 0.1531, + "num_tokens": 90312.0, + "reward": 4.125, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.308422088623047, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 5.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015186072560027242, + "kl": 3.7874280678806826e-05, + "learning_rate": 8.58e-07, + "loss": 0.0, + "num_tokens": 90572.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 5.333333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018420862033963203, + "kl": 2.727508581301663e-05, + "learning_rate": 8.61e-07, + "loss": 0.0, + "num_tokens": 90792.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 5.351851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.638594150543213, + "kl": 0.0003076635766774416, + "learning_rate": 8.64e-07, + "loss": 0.0638, + "num_tokens": 91084.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.183140277862549, + "kl": 0.0020975497318431735, + "learning_rate": 8.669999999999999e-07, + "loss": 0.1627, + "num_tokens": 91345.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 5.388888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027248112484812737, + "kl": 0.0008877874352037907, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0, + "num_tokens": 91672.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 5.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007399424444884062, + "kl": 0.0001283470082853455, + "learning_rate": 8.729999999999999e-07, + "loss": 0.0, + "num_tokens": 91981.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 5.425925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.7383952140808105, + "kl": 0.0005632737884297967, + "learning_rate": 8.76e-07, + "loss": 0.031, + "num_tokens": 92291.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 5.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.111842393875122, + "kl": 0.00033309114223811775, + "learning_rate": 8.79e-07, + "loss": 0.0634, + "num_tokens": 92633.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 5.462962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002238117391243577, + "kl": 5.358642010833137e-05, + "learning_rate": 8.82e-07, + "loss": 0.0, + "num_tokens": 92952.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 5.481481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035788360983133316, + "kl": 0.0009891540103126317, + "learning_rate": 8.85e-07, + "loss": 0.0, + "num_tokens": 93224.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07432208955287933, + "kl": 0.0010253414511680603, + "learning_rate": 8.88e-07, + "loss": 0.0001, + "num_tokens": 93436.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 5.518518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9599413871765137, + "kl": 0.00011427780555095524, + "learning_rate": 8.91e-07, + "loss": 0.0376, + "num_tokens": 93727.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 39.0, + "completions/mean_terminated_length": 39.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.537037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6163761615753174, + "kl": 0.0018340190581511706, + "learning_rate": 8.939999999999999e-07, + "loss": 0.3495, + "num_tokens": 94107.0, + "reward": 3.375, + "reward_std": 4.643543720245361, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 4.643543720245361, + "step": 299 + }, + { + "clip_ratio/high_max": 0.014705882407724857, + "clip_ratio/high_mean": 0.014705882407724857, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014705882407724857, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 5.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5918540954589844, + "kl": 0.0007441753841703758, + "learning_rate": 8.969999999999999e-07, + "loss": 0.0033, + "num_tokens": 94399.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 5.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023262349888682365, + "kl": 0.0008120479760691524, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 94706.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.592592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007687571458518505, + "kl": 0.00011148526391480118, + "learning_rate": 9.03e-07, + "loss": 0.0, + "num_tokens": 94985.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 5.611111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013503205263987184, + "kl": 7.820480050213519e-06, + "learning_rate": 9.06e-07, + "loss": 0.0, + "num_tokens": 95293.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.17464542388916, + "kl": 0.0005902486263948958, + "learning_rate": 9.09e-07, + "loss": 0.1436, + "num_tokens": 95598.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 5.648148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009366406593471766, + "kl": 2.6514132514421362e-05, + "learning_rate": 9.12e-07, + "loss": 0.0, + "num_tokens": 95814.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 84.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 84.0, + "completions/mean_terminated_length": 26.666667938232422, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 5.666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890966773033142, + "kl": 0.00028350279171718284, + "learning_rate": 9.15e-07, + "loss": 0.477, + "num_tokens": 96394.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 5.685185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01795792579650879, + "kl": 0.0003880493895849213, + "learning_rate": 9.18e-07, + "loss": 0.0, + "num_tokens": 96665.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.011904762126505375, + "clip_ratio/high_mean": 0.011904762126505375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011904762126505375, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 5.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3754703998565674, + "kl": 0.0007309003995032981, + "learning_rate": 9.210000000000001e-07, + "loss": -0.1105, + "num_tokens": 97011.0, + "reward": 0.25, + "reward_std": 0.5, + "rewards/reward_combined/mean": 0.25, + "rewards/reward_combined/std": 0.5, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 5.722222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015370920300483704, + "kl": 0.0002644389860506635, + "learning_rate": 9.24e-07, + "loss": 0.0, + "num_tokens": 97267.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.880171775817871, + "kl": 0.0003286706341896206, + "learning_rate": 9.27e-07, + "loss": 0.3679, + "num_tokens": 97610.0, + "reward": 3.75, + "reward_std": 2.723355770111084, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.723355770111084, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 5.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7711498737335205, + "kl": 0.0010276629727741238, + "learning_rate": 9.3e-07, + "loss": 0.012, + "num_tokens": 97961.0, + "reward": 1.875, + "reward_std": 1.75, + "rewards/reward_combined/mean": 1.875, + "rewards/reward_combined/std": 1.75, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 5.777777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014316629618406296, + "kl": 0.00019802508359134663, + "learning_rate": 9.33e-07, + "loss": 0.0, + "num_tokens": 98195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 5.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020444500260055065, + "kl": 4.9935530114453286e-05, + "learning_rate": 9.36e-07, + "loss": 0.0, + "num_tokens": 98465.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.964408673113212e-05, + "kl": 1.3820827007293701e-06, + "learning_rate": 9.39e-07, + "loss": 0.0, + "num_tokens": 98685.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 5.833333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5785439014434814, + "kl": 0.0023260287125594914, + "learning_rate": 9.42e-07, + "loss": 0.0542, + "num_tokens": 99022.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 5.851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006944568012841046, + "kl": 7.503728056690306e-06, + "learning_rate": 9.450000000000001e-07, + "loss": 0.0, + "num_tokens": 99331.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 5.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003281471144873649, + "kl": 2.4133672695825226e-06, + "learning_rate": 9.480000000000001e-07, + "loss": 0.0, + "num_tokens": 99695.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 5.888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9598276615142822, + "kl": 0.0005881217948626727, + "learning_rate": 9.510000000000001e-07, + "loss": -0.0436, + "num_tokens": 99998.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 5.907407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.086658477783203, + "kl": 0.000580109772272408, + "learning_rate": 9.54e-07, + "loss": -0.0999, + "num_tokens": 100287.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 5.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3228225708007812, + "kl": 0.0007797148209647276, + "learning_rate": 9.570000000000001e-07, + "loss": -0.0007, + "num_tokens": 100715.0, + "reward": 0.125, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.125, + "rewards/reward_combined/std": 0.25, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 5.944444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004528549499809742, + "kl": 0.0001575574278831482, + "learning_rate": 9.600000000000001e-07, + "loss": 0.0, + "num_tokens": 100951.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 5.962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004139020573347807, + "kl": 2.9399991035461426e-05, + "learning_rate": 9.630000000000001e-07, + "loss": 0.0, + "num_tokens": 101157.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 5.981481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0748610496520996, + "kl": 0.0004400626348797232, + "learning_rate": 9.660000000000002e-07, + "loss": -0.2451, + "num_tokens": 101554.0, + "reward": 4.125, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.190763473510742, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 6.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.219311714172363, + "kl": 0.0006390140042640269, + "learning_rate": 9.69e-07, + "loss": 0.0043, + "num_tokens": 101904.0, + "reward": 1.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 6.018518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005185985006392002, + "kl": 0.00015741996321594343, + "learning_rate": 9.72e-07, + "loss": 0.0, + "num_tokens": 102162.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 6.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7087786197662354, + "kl": 0.0027644065266940743, + "learning_rate": 9.75e-07, + "loss": -0.1186, + "num_tokens": 102496.0, + "reward": 3.25, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 6.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0765700340271, + "kl": 0.0005693767161574215, + "learning_rate": 9.78e-07, + "loss": -0.0133, + "num_tokens": 102785.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.006172839552164078, + "clip_ratio/low_min": 0.006172839552164078, + "clip_ratio/region_mean": 0.006172839552164078, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 6.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328338623046875, + "kl": 0.000910238508367911, + "learning_rate": 9.81e-07, + "loss": -0.0884, + "num_tokens": 103218.0, + "reward": 0.675000011920929, + "reward_std": 1.0436315536499023, + "rewards/reward_combined/mean": 0.675000011920929, + "rewards/reward_combined/std": 1.0436315536499023, + "step": 328 + }, + { + "clip_ratio/high_max": 0.009615384973585606, + "clip_ratio/high_mean": 0.009615384973585606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009615384973585606, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 6.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.679901599884033, + "kl": 0.0006505083292722702, + "learning_rate": 9.84e-07, + "loss": 0.1025, + "num_tokens": 103533.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 6.111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.89786148071289, + "kl": 0.006327513605356216, + "learning_rate": 9.87e-07, + "loss": 0.3069, + "num_tokens": 103751.0, + "reward": 1.625, + "reward_std": 3.75, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 3.75, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 6.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004182321485131979, + "kl": 0.00012972205877304077, + "learning_rate": 9.9e-07, + "loss": 0.0, + "num_tokens": 103987.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 6.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.095587730407715, + "kl": 0.0001467828915338032, + "learning_rate": 9.93e-07, + "loss": 0.1542, + "num_tokens": 104254.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 6.166666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005609396612271667, + "kl": 4.268196789780632e-06, + "learning_rate": 9.96e-07, + "loss": 0.0, + "num_tokens": 104561.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 6.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.383676052093506, + "kl": 0.0007989190053194761, + "learning_rate": 9.99e-07, + "loss": 0.0026, + "num_tokens": 104833.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 6.203703703703703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02250610664486885, + "kl": 0.00023718326701782644, + "learning_rate": 1.002e-06, + "loss": 0.0, + "num_tokens": 105046.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 6.222222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012680215761065483, + "kl": 0.00019833587248285767, + "learning_rate": 1.0050000000000001e-06, + "loss": 0.0, + "num_tokens": 105281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.2407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.60512638092041, + "kl": 0.0003479543811408803, + "learning_rate": 1.0080000000000001e-06, + "loss": -0.0261, + "num_tokens": 105571.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 6.2592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011472498998045921, + "kl": 0.00012253136264916975, + "learning_rate": 1.0110000000000001e-06, + "loss": 0.0, + "num_tokens": 105849.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 6.277777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007044440601021051, + "kl": 0.0002377020791755058, + "learning_rate": 1.0140000000000002e-06, + "loss": 0.0, + "num_tokens": 106144.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 86.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 86.0, + "completions/mean_terminated_length": 29.33333396911621, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 6.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2721306085586548, + "kl": 0.0005542633589357138, + "learning_rate": 1.0170000000000002e-06, + "loss": 0.4945, + "num_tokens": 106704.0, + "reward": 1.125, + "reward_std": 1.8427786827087402, + "rewards/reward_combined/mean": 1.125, + "rewards/reward_combined/std": 1.8427786827087402, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 6.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.622669696807861, + "kl": 0.00036851988988928497, + "learning_rate": 1.0200000000000002e-06, + "loss": -0.0036, + "num_tokens": 107006.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 6.333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.67360496520996, + "kl": 0.0004253744962170458, + "learning_rate": 1.0230000000000002e-06, + "loss": -0.2063, + "num_tokens": 107223.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 6.351851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.969684600830078, + "kl": 0.0002880445899791084, + "learning_rate": 1.026e-06, + "loss": 0.1829, + "num_tokens": 107489.0, + "reward": 2.625, + "reward_std": 1.75, + "rewards/reward_combined/mean": 2.625, + "rewards/reward_combined/std": 1.75, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.946965456008911, + "kl": 0.0006369232141878456, + "learning_rate": 1.029e-06, + "loss": 0.1015, + "num_tokens": 107810.0, + "reward": 3.25, + "reward_std": 3.0686588287353516, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 3.0686588287353516, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 75.0, + "completions/mean_terminated_length": 75.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 6.388888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1811447143554688, + "kl": 0.000475488806841895, + "learning_rate": 1.032e-06, + "loss": 0.3989, + "num_tokens": 108350.0, + "reward": 0.30000001192092896, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 0.30000001192092896, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 6.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.07658052444458, + "kl": 0.000156188674736768, + "learning_rate": 1.035e-06, + "loss": 0.2415, + "num_tokens": 108689.0, + "reward": 3.0, + "reward_std": 5.196152210235596, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 5.196152210235596, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 6.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025724634528160095, + "kl": 0.0005815433396492153, + "learning_rate": 1.0379999999999998e-06, + "loss": 0.0, + "num_tokens": 109009.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 6.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009777398081496358, + "kl": 4.061988875037059e-05, + "learning_rate": 1.0409999999999999e-06, + "loss": 0.0, + "num_tokens": 109286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 6.462962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8238067626953125, + "kl": 0.0007863019127398729, + "learning_rate": 1.0439999999999999e-06, + "loss": 0.1609, + "num_tokens": 109641.0, + "reward": 3.0, + "reward_std": 3.188521146774292, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.188521146774292, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 6.481481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00102147925645113, + "kl": 2.3031341243040515e-05, + "learning_rate": 1.0469999999999999e-06, + "loss": 0.0, + "num_tokens": 109884.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 6.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.23366641998291, + "kl": 0.00045225843496154994, + "learning_rate": 1.05e-06, + "loss": 0.1784, + "num_tokens": 110246.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 6.518518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.293183832895011e-05, + "kl": 1.762683154993283e-06, + "learning_rate": 1.053e-06, + "loss": 0.0, + "num_tokens": 110610.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 6.537037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.090112686157227, + "kl": 0.0006403782754205167, + "learning_rate": 1.056e-06, + "loss": -0.1273, + "num_tokens": 110944.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 6.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.235172748565674, + "kl": 0.0005330756175681017, + "learning_rate": 1.059e-06, + "loss": 0.0892, + "num_tokens": 111214.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 6.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1353631019592285, + "kl": 0.000659962504869327, + "learning_rate": 1.062e-06, + "loss": -0.0512, + "num_tokens": 111480.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5032594203948975, + "kl": 0.001133062643930316, + "learning_rate": 1.065e-06, + "loss": -0.3193, + "num_tokens": 111827.0, + "reward": 1.375, + "reward_std": 2.3228933811187744, + "rewards/reward_combined/mean": 1.375, + "rewards/reward_combined/std": 2.3228933811187744, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 6.611111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8815441131591797, + "kl": 0.00030877380777383223, + "learning_rate": 1.068e-06, + "loss": -0.0348, + "num_tokens": 112131.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 6.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.957538890768774e-05, + "kl": 4.023313522338867e-07, + "learning_rate": 1.071e-06, + "loss": 0.0, + "num_tokens": 112351.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.648148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019362876191735268, + "kl": 0.0006637527840211987, + "learning_rate": 1.074e-06, + "loss": 0.0, + "num_tokens": 112654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 6.666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3079376220703125, + "kl": 0.004450089356396347, + "learning_rate": 1.077e-06, + "loss": 0.0795, + "num_tokens": 112946.0, + "reward": 7.125, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 7.125, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8428308963775635, + "kl": 6.044354233836202e-05, + "learning_rate": 1.08e-06, + "loss": 0.0624, + "num_tokens": 113234.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 6.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.336367130279541, + "kl": 0.00424649418619083, + "learning_rate": 1.083e-06, + "loss": 0.0272, + "num_tokens": 113541.0, + "reward": 3.75, + "reward_std": 5.057997226715088, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 5.057997226715088, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 6.722222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02751585654914379, + "kl": 0.0002781063230941072, + "learning_rate": 1.086e-06, + "loss": 0.0, + "num_tokens": 113797.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 6.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1264190673828125, + "kl": 0.00015835894737392664, + "learning_rate": 1.089e-06, + "loss": 0.0531, + "num_tokens": 114090.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 93.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 93.75, + "completions/mean_terminated_length": 39.66666793823242, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 6.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.091813325881958, + "kl": 0.0004173710767645389, + "learning_rate": 1.092e-06, + "loss": 0.4636, + "num_tokens": 114689.0, + "reward": 0.2999999523162842, + "reward_std": 5.062279224395752, + "rewards/reward_combined/mean": 0.2999999523162842, + "rewards/reward_combined/std": 5.062278747558594, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 6.777777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011059912852942944, + "kl": 0.00016621185568510555, + "learning_rate": 1.095e-06, + "loss": 0.0, + "num_tokens": 115013.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 6.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006694111507385969, + "kl": 0.00016759575373725966, + "learning_rate": 1.098e-06, + "loss": 0.0, + "num_tokens": 115342.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 6.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008660645224153996, + "kl": 0.0002451826585456729, + "learning_rate": 1.101e-06, + "loss": 0.0, + "num_tokens": 115624.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 6.833333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.97185230255127, + "kl": 0.0005101642454974353, + "learning_rate": 1.104e-06, + "loss": 0.3231, + "num_tokens": 115859.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 369 + }, + { + "clip_ratio/high_max": 0.006849315017461777, + "clip_ratio/high_mean": 0.006849315017461777, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006849315017461777, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 6.851851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.35051965713501, + "kl": 0.0006835753229097463, + "learning_rate": 1.107e-06, + "loss": 0.1762, + "num_tokens": 116219.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 6.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.666428327560425, + "kl": 0.0007801286119502038, + "learning_rate": 1.11e-06, + "loss": 0.0288, + "num_tokens": 116480.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 6.888888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01839413307607174, + "kl": 0.0003345608856761828, + "learning_rate": 1.113e-06, + "loss": 0.0, + "num_tokens": 116740.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 6.907407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.535190105438232, + "kl": 0.0002525041636545211, + "learning_rate": 1.116e-06, + "loss": 0.0665, + "num_tokens": 117057.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 6.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.119668483734131, + "kl": 0.0008038908708840609, + "learning_rate": 1.119e-06, + "loss": 0.0319, + "num_tokens": 117350.0, + "reward": 3.5, + "reward_std": 5.446711540222168, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 5.446711540222168, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 6.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4843716621398926, + "kl": 0.00026746795265353285, + "learning_rate": 1.122e-06, + "loss": -0.0762, + "num_tokens": 117673.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 6.962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003542846767231822, + "kl": 6.279822991928086e-05, + "learning_rate": 1.125e-06, + "loss": 0.0, + "num_tokens": 117941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 6.981481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0071050627157092094, + "kl": 0.0002686096850084141, + "learning_rate": 1.128e-06, + "loss": 0.0, + "num_tokens": 118209.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 7.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0074279578402638435, + "kl": 7.043033838272095e-05, + "learning_rate": 1.131e-06, + "loss": 0.0, + "num_tokens": 118415.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 7.018518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02381051518023014, + "kl": 0.00021494428074220195, + "learning_rate": 1.134e-06, + "loss": 0.0, + "num_tokens": 118631.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 7.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0429561138153076, + "kl": 0.0007445274095516652, + "learning_rate": 1.137e-06, + "loss": 0.1676, + "num_tokens": 119008.0, + "reward": 2.625, + "reward_std": 3.6371922492980957, + "rewards/reward_combined/mean": 2.625, + "rewards/reward_combined/std": 3.6371922492980957, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001879699295386672, + "clip_ratio/low_min": 0.001879699295386672, + "clip_ratio/region_mean": 0.001879699295386672, + "completion_length": 71.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 71.5, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.623712420463562, + "kl": 0.0001712968969513895, + "learning_rate": 1.14e-06, + "loss": 0.4876, + "num_tokens": 119514.0, + "reward": 1.0499999523162842, + "reward_std": 3.2264533042907715, + "rewards/reward_combined/mean": 1.0499999523162842, + "rewards/reward_combined/std": 3.2264530658721924, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 7.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.947168827056885, + "kl": 0.0006473724788520485, + "learning_rate": 1.1430000000000001e-06, + "loss": 0.2235, + "num_tokens": 119819.0, + "reward": 3.125, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008771929889917374, + "clip_ratio/low_min": 0.008771929889917374, + "clip_ratio/region_mean": 0.008771929889917374, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 7.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.786294937133789, + "kl": 0.0009779173415154219, + "learning_rate": 1.1460000000000001e-06, + "loss": 0.2366, + "num_tokens": 120158.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 7.111111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02692675217986107, + "kl": 0.00034688253072090447, + "learning_rate": 1.1490000000000001e-06, + "loss": 0.0, + "num_tokens": 120434.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 7.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033407099545001984, + "kl": 0.0017122626304626465, + "learning_rate": 1.1520000000000002e-06, + "loss": 0.0001, + "num_tokens": 120670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 7.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3273794651031494, + "kl": 0.0006077333237044513, + "learning_rate": 1.155e-06, + "loss": -0.0781, + "num_tokens": 120963.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 7.166666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8814287185668945, + "kl": 0.0002583457971923053, + "learning_rate": 1.158e-06, + "loss": 0.0313, + "num_tokens": 121230.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 7.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.018314838409424, + "kl": 0.0004732194356620312, + "learning_rate": 1.161e-06, + "loss": 0.0882, + "num_tokens": 121567.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 7.203703703703703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004768583457916975, + "kl": 7.208179499684775e-05, + "learning_rate": 1.164e-06, + "loss": 0.0, + "num_tokens": 121874.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 7.222222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00903977919369936, + "kl": 0.00011473968697828241, + "learning_rate": 1.167e-06, + "loss": 0.0, + "num_tokens": 122192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 7.2407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.260708808898926, + "kl": 0.0007341466553043574, + "learning_rate": 1.17e-06, + "loss": 0.0367, + "num_tokens": 122518.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 7.2592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04728669673204422, + "kl": 0.002906686277128756, + "learning_rate": 1.173e-06, + "loss": 0.0002, + "num_tokens": 122802.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.277777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009743906557559967, + "kl": 0.00012787146988557652, + "learning_rate": 1.176e-06, + "loss": 0.0, + "num_tokens": 123059.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 7.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.877381443977356, + "kl": 0.0004237563698552549, + "learning_rate": 1.179e-06, + "loss": -0.0545, + "num_tokens": 123477.0, + "reward": 0.875, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 0.875, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 87.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 87.0, + "completions/mean_terminated_length": 87.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 7.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5418593883514404, + "kl": 0.00028770643984898925, + "learning_rate": 1.182e-06, + "loss": 0.346, + "num_tokens": 124049.0, + "reward": 5.175000190734863, + "reward_std": 4.650000095367432, + "rewards/reward_combined/mean": 5.175000190734863, + "rewards/reward_combined/std": 4.650000095367432, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 7.333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6314048767089844, + "kl": 0.009780138731002808, + "learning_rate": 1.185e-06, + "loss": 0.0369, + "num_tokens": 124348.0, + "reward": 3.875, + "reward_std": 4.75, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 4.75, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 7.351851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008155149407684803, + "kl": 0.0002987432962981984, + "learning_rate": 1.188e-06, + "loss": 0.0, + "num_tokens": 124673.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 7.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.014007522026077e-05, + "kl": 7.972121238708496e-07, + "learning_rate": 1.1910000000000001e-06, + "loss": 0.0, + "num_tokens": 124893.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 7.388888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.616152286529541, + "kl": 0.0004403176426421851, + "learning_rate": 1.1940000000000001e-06, + "loss": 0.1236, + "num_tokens": 125239.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042800359427928925, + "kl": 0.0007947605045046657, + "learning_rate": 1.1970000000000001e-06, + "loss": 0.0, + "num_tokens": 125515.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 7.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0063782949000597, + "kl": 0.00010485592065379024, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0, + "num_tokens": 125833.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 7.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.440243721008301, + "kl": 0.0008377085032407194, + "learning_rate": 1.2030000000000002e-06, + "loss": -0.0047, + "num_tokens": 126140.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 7.462962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.803163528442383, + "kl": 0.00040737282688496634, + "learning_rate": 1.2060000000000002e-06, + "loss": 0.0286, + "num_tokens": 126448.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 403 + }, + { + "clip_ratio/high_max": 0.010204081423580647, + "clip_ratio/high_mean": 0.010204081423580647, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010204081423580647, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 7.481481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.568783760070801, + "kl": 0.0010241676936857402, + "learning_rate": 1.2090000000000002e-06, + "loss": 0.125, + "num_tokens": 126782.0, + "reward": 1.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 7.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.622627258300781, + "kl": 0.0013473780127242208, + "learning_rate": 1.2120000000000002e-06, + "loss": -0.0242, + "num_tokens": 127086.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 7.518518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.908332586288452, + "kl": 0.00019460863404674456, + "learning_rate": 1.215e-06, + "loss": 0.0158, + "num_tokens": 127379.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 7.537037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.318586587905884, + "kl": 0.000286047605186468, + "learning_rate": 1.218e-06, + "loss": 0.0681, + "num_tokens": 127657.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 7.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.478938102722168, + "kl": 0.002324142144061625, + "learning_rate": 1.221e-06, + "loss": 0.0181, + "num_tokens": 127875.0, + "reward": 3.125, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 7.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6333165168762207, + "kl": 0.0003788674366660416, + "learning_rate": 1.224e-06, + "loss": 0.0038, + "num_tokens": 128157.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 7.592592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005975629203021526, + "kl": 4.957751730216842e-05, + "learning_rate": 1.2269999999999999e-06, + "loss": 0.0, + "num_tokens": 128453.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 7.611111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004418897908180952, + "kl": 5.195409289626696e-05, + "learning_rate": 1.2299999999999999e-06, + "loss": 0.0, + "num_tokens": 128819.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 74.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 74.75, + "completions/mean_terminated_length": 14.333333969116211, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 7.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.937324047088623, + "kl": 0.00043015779374400154, + "learning_rate": 1.2329999999999999e-06, + "loss": 0.4479, + "num_tokens": 129338.0, + "reward": 3.25, + "reward_std": 4.907477378845215, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 4.907477378845215, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 7.648148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.567517042160034, + "kl": 0.014520742231979966, + "learning_rate": 1.236e-06, + "loss": 0.1298, + "num_tokens": 129635.0, + "reward": 5.25, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 7.666666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019435608759522438, + "kl": 0.0003772839190787636, + "learning_rate": 1.239e-06, + "loss": 0.0, + "num_tokens": 129869.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 7.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.85819149017334, + "kl": 0.004227758385241032, + "learning_rate": 1.242e-06, + "loss": 0.0502, + "num_tokens": 130156.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 7.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.305883884429932, + "kl": 0.0005044145509600639, + "learning_rate": 1.245e-06, + "loss": 0.0039, + "num_tokens": 130446.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.722222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05890405923128128, + "kl": 0.001081001479178667, + "learning_rate": 1.248e-06, + "loss": 0.0001, + "num_tokens": 130708.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6684036254882812, + "kl": 7.339137391682016e-05, + "learning_rate": 1.251e-06, + "loss": 0.0325, + "num_tokens": 130980.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 7.7592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.540816783905029, + "kl": 0.00023993071226868778, + "learning_rate": 1.254e-06, + "loss": 0.0785, + "num_tokens": 131243.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 7.777777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025105897337198257, + "kl": 0.0004859610344283283, + "learning_rate": 1.257e-06, + "loss": 0.0, + "num_tokens": 131566.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 0.5, + "rewards/reward_combined/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 7.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00729712937027216, + "kl": 0.00019451748084975407, + "learning_rate": 1.26e-06, + "loss": 0.0, + "num_tokens": 131880.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 7.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10664273798465729, + "kl": 0.001314006745815277, + "learning_rate": 1.263e-06, + "loss": 0.0001, + "num_tokens": 132092.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 7.833333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.205585479736328, + "kl": 0.0014551758067682385, + "learning_rate": 1.266e-06, + "loss": 0.2398, + "num_tokens": 132467.0, + "reward": 2.375, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 7.851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014647755771875381, + "kl": 0.00028278891841182485, + "learning_rate": 1.269e-06, + "loss": 0.0, + "num_tokens": 132735.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 7.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04798325151205063, + "kl": 0.0003249421715736389, + "learning_rate": 1.272e-06, + "loss": 0.0, + "num_tokens": 132941.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 7.888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.873548984527588, + "kl": 0.0014190449146553874, + "learning_rate": 1.275e-06, + "loss": 0.0396, + "num_tokens": 133245.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 7.907407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.278851509094238, + "kl": 0.007844563573598862, + "learning_rate": 1.278e-06, + "loss": 0.0466, + "num_tokens": 133507.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 7.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046332165598869324, + "kl": 0.0006397667457349598, + "learning_rate": 1.281e-06, + "loss": 0.0, + "num_tokens": 133756.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 7.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.288076400756836, + "kl": 0.0013455498265102506, + "learning_rate": 1.284e-06, + "loss": 0.0601, + "num_tokens": 134089.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 7.962962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006751934066414833, + "kl": 0.00013109322026139125, + "learning_rate": 1.287e-06, + "loss": 0.0, + "num_tokens": 134351.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 7.981481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.11538553237915, + "kl": 0.0004287466290406883, + "learning_rate": 1.29e-06, + "loss": 0.3603, + "num_tokens": 134588.0, + "reward": 2.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 2.625, + "rewards/reward_combined/std": 2.75, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.785628795623779, + "kl": 0.0006086694047553465, + "learning_rate": 1.293e-06, + "loss": 0.1831, + "num_tokens": 134869.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.018518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.176998615264893, + "kl": 0.001978690270334482, + "learning_rate": 1.296e-06, + "loss": -0.0606, + "num_tokens": 135158.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.037037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013280250132083893, + "kl": 0.0003153277466481086, + "learning_rate": 1.299e-06, + "loss": 0.0, + "num_tokens": 135465.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 75.25, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 8.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.434341907501221, + "kl": 0.0005841281672473997, + "learning_rate": 1.302e-06, + "loss": 0.3676, + "num_tokens": 135982.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 8.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.653457164764404, + "kl": 0.0009990200342144817, + "learning_rate": 1.305e-06, + "loss": -0.068, + "num_tokens": 136246.0, + "reward": 2.0, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.002219200134277, + "kl": 0.0002858766383724287, + "learning_rate": 1.308e-06, + "loss": 0.1682, + "num_tokens": 136530.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 85.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 85.25, + "completions/mean_terminated_length": 28.33333396911621, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 8.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5005788803100586, + "kl": 0.001248143264092505, + "learning_rate": 1.311e-06, + "loss": 0.2577, + "num_tokens": 137123.0, + "reward": 2.924999952316284, + "reward_std": 3.3944807052612305, + "rewards/reward_combined/mean": 2.924999952316284, + "rewards/reward_combined/std": 3.3944807052612305, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 62.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 62.5, + "completions/mean_terminated_length": 62.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 8.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4210071563720703, + "kl": 0.001322774973232299, + "learning_rate": 1.314e-06, + "loss": 0.0425, + "num_tokens": 137601.0, + "reward": 4.550000190734863, + "reward_std": 3.07300066947937, + "rewards/reward_combined/mean": 4.550000190734863, + "rewards/reward_combined/std": 3.07300066947937, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 8.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.893808364868164, + "kl": 0.001215376891195774, + "learning_rate": 1.317e-06, + "loss": -0.0023, + "num_tokens": 137905.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 8.166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039437662810087204, + "kl": 0.0012664295791182667, + "learning_rate": 1.32e-06, + "loss": 0.0001, + "num_tokens": 138235.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 8.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2584749460220337, + "kl": 7.745823495497461e-06, + "learning_rate": 1.323e-06, + "loss": 0.0013, + "num_tokens": 138598.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 8.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02892359346151352, + "kl": 0.0005071528576081619, + "learning_rate": 1.326e-06, + "loss": 0.0, + "num_tokens": 138864.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.222222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.531430721282959, + "kl": 0.00043822579027619213, + "learning_rate": 1.3290000000000001e-06, + "loss": -0.0243, + "num_tokens": 139166.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.594545841217041, + "kl": 0.00045026788575341925, + "learning_rate": 1.3320000000000001e-06, + "loss": 0.0932, + "num_tokens": 139439.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 8.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043402764946222305, + "kl": 0.0017976841118070297, + "learning_rate": 1.3350000000000001e-06, + "loss": 0.0001, + "num_tokens": 139729.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 8.277777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8853728771209717, + "kl": 0.009266020730137825, + "learning_rate": 1.3380000000000001e-06, + "loss": 0.0311, + "num_tokens": 140021.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 8.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.021646499633789, + "kl": 0.01693438400980085, + "learning_rate": 1.3410000000000002e-06, + "loss": 0.0839, + "num_tokens": 140283.0, + "reward": 7.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 7.25, + "rewards/reward_combined/std": 1.5, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 8.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017803365364670753, + "kl": 0.0006827447505202144, + "learning_rate": 1.344e-06, + "loss": 0.0, + "num_tokens": 140606.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 8.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.326924800872803, + "kl": 0.0030778807904425776, + "learning_rate": 1.347e-06, + "loss": 0.0366, + "num_tokens": 140880.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 8.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021769413724541664, + "kl": 0.0008617108105681837, + "learning_rate": 1.35e-06, + "loss": 0.0001, + "num_tokens": 141227.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 8.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3093459606170654, + "kl": 0.00042448812746442854, + "learning_rate": 1.353e-06, + "loss": 0.1012, + "num_tokens": 141491.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03047255426645279, + "kl": 0.0015327175497077405, + "learning_rate": 1.356e-06, + "loss": 0.0001, + "num_tokens": 141772.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 8.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3602254390716553, + "kl": 0.00742534501478076, + "learning_rate": 1.359e-06, + "loss": 0.1568, + "num_tokens": 142068.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011161944828927517, + "kl": 0.00017811357975006104, + "learning_rate": 1.362e-06, + "loss": 0.0, + "num_tokens": 142280.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 8.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.692661762237549, + "kl": 0.0005987969925627112, + "learning_rate": 1.365e-06, + "loss": -0.0898, + "num_tokens": 142544.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.462962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03104279935359955, + "kl": 0.00034093111753463745, + "learning_rate": 1.368e-06, + "loss": 0.0, + "num_tokens": 142764.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05662114918231964, + "kl": 0.001474709075409919, + "learning_rate": 1.371e-06, + "loss": 0.0001, + "num_tokens": 142996.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 8.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.059370517730713, + "kl": 0.0018500362057238817, + "learning_rate": 1.374e-06, + "loss": 0.1937, + "num_tokens": 143269.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 8.518518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037880558520555496, + "kl": 0.000834152102470398, + "learning_rate": 1.3770000000000001e-06, + "loss": 0.0, + "num_tokens": 143477.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 8.537037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1003763675689697, + "kl": 0.0006077908328734338, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0427, + "num_tokens": 143884.0, + "reward": 0.125, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.125, + "rewards/reward_combined/std": 0.25, + "step": 461 + }, + { + "clip_ratio/high_max": 0.013513513840734959, + "clip_ratio/high_mean": 0.013513513840734959, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5658528804779053, + "kl": 0.030395436100661755, + "learning_rate": 1.3830000000000001e-06, + "loss": 0.0682, + "num_tokens": 144186.0, + "reward": 3.5, + "reward_std": 3.488075017929077, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 3.488075017929077, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 8.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026647405698895454, + "kl": 0.00041704023715283256, + "learning_rate": 1.3860000000000002e-06, + "loss": 0.0, + "num_tokens": 144420.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 8.592592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.35224986076355, + "kl": 0.0005700888286810368, + "learning_rate": 1.3890000000000002e-06, + "loss": 0.2826, + "num_tokens": 144802.0, + "reward": 1.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 8.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.050783157348633, + "kl": 0.0005874587732250802, + "learning_rate": 1.3920000000000002e-06, + "loss": 0.1148, + "num_tokens": 145105.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 8.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.94515323638916, + "kl": 0.0006666499830316752, + "learning_rate": 1.3950000000000002e-06, + "loss": -0.1806, + "num_tokens": 145466.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 8.648148148148149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016897855093702674, + "kl": 4.260614514350891e-05, + "learning_rate": 1.3980000000000002e-06, + "loss": 0.0, + "num_tokens": 145710.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 8.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.4314656257629395, + "kl": 0.0016374941042158753, + "learning_rate": 1.401e-06, + "loss": 0.1133, + "num_tokens": 146066.0, + "reward": 0.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.625, + "rewards/reward_combined/std": 0.25, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 8.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6874823570251465, + "kl": 0.0025886285584419966, + "learning_rate": 1.404e-06, + "loss": 0.014, + "num_tokens": 146380.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 8.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0305562019348145, + "kl": 0.0030521515873260796, + "learning_rate": 1.407e-06, + "loss": 0.1832, + "num_tokens": 146725.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 8.722222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.66194224357605, + "kl": 0.0007765647023916245, + "learning_rate": 1.41e-06, + "loss": 0.0002, + "num_tokens": 147092.0, + "reward": 1.5, + "reward_std": 1.3540064096450806, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 1.3540064096450806, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 8.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3744161128997803, + "kl": 0.0008247126825153828, + "learning_rate": 1.4129999999999999e-06, + "loss": 0.1106, + "num_tokens": 147377.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 8.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0727128982543945, + "kl": 0.001286379061639309, + "learning_rate": 1.4159999999999999e-06, + "loss": 0.0786, + "num_tokens": 147719.0, + "reward": 1.125, + "reward_std": 1.8427786827087402, + "rewards/reward_combined/mean": 1.125, + "rewards/reward_combined/std": 1.8427786827087402, + "step": 473 + }, + { + "clip_ratio/high_max": 0.02500000037252903, + "clip_ratio/high_mean": 0.02500000037252903, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02500000037252903, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 8.777777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.285005569458008, + "kl": 0.0018552718393038958, + "learning_rate": 1.4189999999999999e-06, + "loss": 0.0381, + "num_tokens": 147986.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 8.796296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.263662815093994, + "kl": 0.001372149446979165, + "learning_rate": 1.422e-06, + "loss": 0.0716, + "num_tokens": 148291.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03566886484622955, + "kl": 0.0007755518017802387, + "learning_rate": 1.425e-06, + "loss": 0.0, + "num_tokens": 148509.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 8.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018361331894993782, + "kl": 0.00014239762822398916, + "learning_rate": 1.428e-06, + "loss": 0.0, + "num_tokens": 148789.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 8.851851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03665432706475258, + "kl": 0.0007165893621277064, + "learning_rate": 1.431e-06, + "loss": 0.0, + "num_tokens": 149045.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 80.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 8.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.69806706905365, + "kl": 0.0013292425137478858, + "learning_rate": 1.434e-06, + "loss": 0.4274, + "num_tokens": 149619.0, + "reward": 5.25, + "reward_std": 5.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 5.5, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 8.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.148035526275635, + "kl": 0.01693603489547968, + "learning_rate": 1.437e-06, + "loss": 0.0869, + "num_tokens": 149884.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.907407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.436980724334717, + "kl": 0.002011268137721345, + "learning_rate": 1.44e-06, + "loss": -0.046, + "num_tokens": 150194.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014376312494277954, + "kl": 0.0006001824513077736, + "learning_rate": 1.443e-06, + "loss": 0.0, + "num_tokens": 150506.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 8.944444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008478103205561638, + "kl": 0.000158771472342778, + "learning_rate": 1.446e-06, + "loss": 0.0, + "num_tokens": 150831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0075084068812429905, + "kl": 0.0004975050687789917, + "learning_rate": 1.449e-06, + "loss": 0.0, + "num_tokens": 151067.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 8.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021673977375030518, + "kl": 0.0014037236687727273, + "learning_rate": 1.452e-06, + "loss": 0.0001, + "num_tokens": 151362.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 9.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06421205401420593, + "kl": 0.0022028908133506775, + "learning_rate": 1.455e-06, + "loss": 0.0001, + "num_tokens": 151576.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 9.018518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010776751674711704, + "kl": 0.00019974775204900652, + "learning_rate": 1.458e-06, + "loss": 0.0, + "num_tokens": 151897.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.037037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005626564845442772, + "kl": 0.00015673040979891084, + "learning_rate": 1.461e-06, + "loss": 0.0, + "num_tokens": 152193.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 9.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0110141821205616, + "kl": 0.0002580478831077926, + "learning_rate": 1.464e-06, + "loss": 0.0, + "num_tokens": 152449.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 72.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 72.5, + "completions/mean_terminated_length": 72.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 9.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9944734573364258, + "kl": 0.0009289936569985002, + "learning_rate": 1.467e-06, + "loss": 0.4543, + "num_tokens": 152963.0, + "reward": 2.674999952316284, + "reward_std": 5.974040985107422, + "rewards/reward_combined/mean": 2.674999952316284, + "rewards/reward_combined/std": 5.974040985107422, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.092592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14732055366039276, + "kl": 0.004145276732742786, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.0002, + "num_tokens": 153237.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05999904125928879, + "kl": 0.0030060313874855638, + "learning_rate": 1.473e-06, + "loss": 0.0002, + "num_tokens": 153507.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 9.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005623538745567203, + "kl": 1.4913433005858678e-05, + "learning_rate": 1.476e-06, + "loss": 0.0, + "num_tokens": 153815.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 9.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.0974884033203125, + "kl": 0.001276601484278217, + "learning_rate": 1.479e-06, + "loss": 0.3508, + "num_tokens": 154077.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.006172839552164078, + "clip_ratio/low_min": 0.006172839552164078, + "clip_ratio/region_mean": 0.006172839552164078, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 9.166666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2115442752838135, + "kl": 0.02928131737280637, + "learning_rate": 1.482e-06, + "loss": -0.0329, + "num_tokens": 154504.0, + "reward": -0.8250000476837158, + "reward_std": 2.3286263942718506, + "rewards/reward_combined/mean": -0.8250000476837158, + "rewards/reward_combined/std": 2.3286263942718506, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3.0, + "completions/max_terminated_length": 3.0, + "completions/mean_length": 2.25, + "completions/mean_terminated_length": 2.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 9.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.164260864257812, + "kl": 0.017533445730805397, + "learning_rate": 1.485e-06, + "loss": 0.0917, + "num_tokens": 154717.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 100.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 100.75, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 9.203703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6346595287323, + "kl": 0.001183508342364803, + "learning_rate": 1.488e-06, + "loss": -0.1579, + "num_tokens": 155344.0, + "reward": 1.5499999523162842, + "reward_std": 4.050926208496094, + "rewards/reward_combined/mean": 1.5499999523162842, + "rewards/reward_combined/std": 4.0509257316589355, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 9.222222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016270643391180784, + "kl": 3.6284327507019043e-06, + "learning_rate": 1.491e-06, + "loss": 0.0, + "num_tokens": 155564.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 9.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.976415634155273, + "kl": 0.001442478969693184, + "learning_rate": 1.494e-06, + "loss": 0.1614, + "num_tokens": 155884.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01832779496908188, + "kl": 0.0006520260430988856, + "learning_rate": 1.497e-06, + "loss": 0.0, + "num_tokens": 156198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 9.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001106397365219891, + "kl": 1.572271139593795e-05, + "learning_rate": 1.5e-06, + "loss": 0.0, + "num_tokens": 156468.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 9.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.436293363571167, + "kl": 0.001056065782904625, + "learning_rate": 1.503e-06, + "loss": 0.033, + "num_tokens": 156763.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.314814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.309901237487793, + "kl": 0.0021982649923302233, + "learning_rate": 1.506e-06, + "loss": 0.1451, + "num_tokens": 157100.0, + "reward": 2.5, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.674234628677368, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 9.333333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007930577732622623, + "kl": 0.0005407258868217468, + "learning_rate": 1.509e-06, + "loss": 0.0, + "num_tokens": 157336.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 9.351851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4427762031555176, + "kl": 0.00042985317577404203, + "learning_rate": 1.512e-06, + "loss": 0.0002, + "num_tokens": 157700.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 9.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011734436266124249, + "kl": 0.0004800920287379995, + "learning_rate": 1.5150000000000001e-06, + "loss": 0.0, + "num_tokens": 158032.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 9.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07170011103153229, + "kl": 0.005940357630606741, + "learning_rate": 1.5180000000000001e-06, + "loss": 0.0003, + "num_tokens": 158315.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.212244987487793, + "kl": 0.005429249256849289, + "learning_rate": 1.5210000000000001e-06, + "loss": 0.0288, + "num_tokens": 158619.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 9.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009870851412415504, + "kl": 0.0002587953640613705, + "learning_rate": 1.5240000000000001e-06, + "loss": 0.0, + "num_tokens": 158949.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 9.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.560111999511719, + "kl": 0.002522015245631337, + "learning_rate": 1.5270000000000002e-06, + "loss": -0.0123, + "num_tokens": 159242.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 9.462962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.170704364776611, + "kl": 0.002358070865739137, + "learning_rate": 1.53e-06, + "loss": -0.0019, + "num_tokens": 159547.0, + "reward": 1.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.5, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 9.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015831032767891884, + "kl": 0.0006280697125475854, + "learning_rate": 1.533e-06, + "loss": 0.0, + "num_tokens": 159830.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.864472389221191, + "kl": 0.003002442157594487, + "learning_rate": 1.536e-06, + "loss": 0.1529, + "num_tokens": 160170.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 9.518518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05545349791646004, + "kl": 0.0025941634085029364, + "learning_rate": 1.539e-06, + "loss": 0.0001, + "num_tokens": 160462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.537037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017333369702100754, + "kl": 0.0011692616099026054, + "learning_rate": 1.542e-06, + "loss": 0.0001, + "num_tokens": 160832.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 9.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05681229010224342, + "kl": 0.0010110288858413696, + "learning_rate": 1.545e-06, + "loss": 0.0001, + "num_tokens": 161092.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 9.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5500227212905884, + "kl": 0.00148715078830719, + "learning_rate": 1.548e-06, + "loss": 0.4173, + "num_tokens": 161651.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 9.592592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019148801220580935, + "kl": 0.00011230686141061597, + "learning_rate": 1.551e-06, + "loss": 0.0, + "num_tokens": 161931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 9.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5351691246032715, + "kl": 0.6368295695865527, + "learning_rate": 1.554e-06, + "loss": 0.0305, + "num_tokens": 162229.0, + "reward": 7.0, + "reward_std": 2.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 2.0, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 9.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008595379069447517, + "kl": 0.00012179091572761536, + "learning_rate": 1.557e-06, + "loss": 0.0, + "num_tokens": 162441.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.648148148148149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016889087855815887, + "kl": 0.0012415878591127694, + "learning_rate": 1.56e-06, + "loss": 0.0001, + "num_tokens": 162723.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.879021167755127, + "kl": 0.0016143560060299933, + "learning_rate": 1.5630000000000001e-06, + "loss": -0.0411, + "num_tokens": 162991.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 9.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.694362163543701, + "kl": 0.0008695534197613597, + "learning_rate": 1.5660000000000001e-06, + "loss": 0.1415, + "num_tokens": 163282.0, + "reward": 5.875, + "reward_std": 3.5910768508911133, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.5910770893096924, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 9.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.453367710113525, + "kl": 0.002435041998978704, + "learning_rate": 1.5690000000000001e-06, + "loss": 0.4561, + "num_tokens": 163554.0, + "reward": 2.375, + "reward_std": 1.8874585628509521, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.8874585628509521, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.722222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02079276740550995, + "kl": 0.001192980445921421, + "learning_rate": 1.5720000000000002e-06, + "loss": 0.0001, + "num_tokens": 163882.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 9.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.8610076904296875, + "kl": 0.0023750447726342827, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.1223, + "num_tokens": 164154.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 9.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8834428787231445, + "kl": 0.0032023880630731583, + "learning_rate": 1.5780000000000002e-06, + "loss": 0.0633, + "num_tokens": 164499.0, + "reward": 2.5, + "reward_std": 1.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 9.777777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.78782320022583, + "kl": 0.0025741655263118446, + "learning_rate": 1.5810000000000002e-06, + "loss": 0.0926, + "num_tokens": 164869.0, + "reward": 3.049999952316284, + "reward_std": 0.33166250586509705, + "rewards/reward_combined/mean": 3.049999952316284, + "rewards/reward_combined/std": 0.33166247606277466, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 9.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27029284834861755, + "kl": 0.009208195144310594, + "learning_rate": 1.5840000000000002e-06, + "loss": 0.0005, + "num_tokens": 165127.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 9.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014770339243113995, + "kl": 0.00014358200132846832, + "learning_rate": 1.5870000000000002e-06, + "loss": 0.0, + "num_tokens": 165371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 9.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0294018667191267, + "kl": 0.0008226931531680748, + "learning_rate": 1.59e-06, + "loss": 0.0, + "num_tokens": 165631.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 9.851851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01135218609124422, + "kl": 0.0005414411425590515, + "learning_rate": 1.593e-06, + "loss": 0.0, + "num_tokens": 165841.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 57.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 9.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.112264156341553, + "kl": 0.0026307841762900352, + "learning_rate": 1.596e-06, + "loss": 0.131, + "num_tokens": 166285.0, + "reward": 1.5499999523162842, + "reward_std": 4.859012126922607, + "rewards/reward_combined/mean": 1.5499999523162842, + "rewards/reward_combined/std": 4.859012126922607, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 9.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.435405731201172, + "kl": 0.021325815469026566, + "learning_rate": 1.599e-06, + "loss": -0.0024, + "num_tokens": 166573.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 9.907407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5232558250427246, + "kl": 0.040265748859383166, + "learning_rate": 1.602e-06, + "loss": -0.152, + "num_tokens": 166941.0, + "reward": 2.25, + "reward_std": 4.051748752593994, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 4.051748752593994, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 9.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.802764892578125, + "kl": 0.010558367241173983, + "learning_rate": 1.605e-06, + "loss": 0.071, + "num_tokens": 167204.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 9.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.850037097930908, + "kl": 0.0031486726365983486, + "learning_rate": 1.608e-06, + "loss": 0.0171, + "num_tokens": 167496.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 9.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04560142755508423, + "kl": 0.0008016876163310371, + "learning_rate": 1.6110000000000001e-06, + "loss": 0.0, + "num_tokens": 167718.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 9.981481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.351009845733643, + "kl": 0.005056664114817977, + "learning_rate": 1.6140000000000001e-06, + "loss": -0.0552, + "num_tokens": 167984.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 10.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.612298965454102, + "kl": 0.0034471265971660614, + "learning_rate": 1.6170000000000001e-06, + "loss": 0.1006, + "num_tokens": 168326.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 10.018518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0329267680644989, + "kl": 0.0011048528831452131, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.0001, + "num_tokens": 168630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.037037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.368251323699951, + "kl": 0.00031063980713952333, + "learning_rate": 1.6230000000000002e-06, + "loss": 0.0412, + "num_tokens": 168928.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 75.75, + "completions/mean_terminated_length": 15.666666984558105, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 10.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2446129322052, + "kl": 0.002881466527469456, + "learning_rate": 1.6260000000000002e-06, + "loss": 0.4622, + "num_tokens": 169451.0, + "reward": 7.300000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.300000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 74.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 74.5, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 10.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5643595457077026, + "kl": 0.00021576111612375826, + "learning_rate": 1.6290000000000002e-06, + "loss": 0.4485, + "num_tokens": 169973.0, + "reward": 5.625, + "reward_std": 4.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 4.75, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.54982328414917, + "kl": 0.001731416559778154, + "learning_rate": 1.6320000000000002e-06, + "loss": 0.1921, + "num_tokens": 170286.0, + "reward": 0.25, + "reward_std": 0.8660253882408142, + "rewards/reward_combined/mean": 0.25, + "rewards/reward_combined/std": 0.8660253882408142, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2725299596786499, + "kl": 0.012912555976072326, + "learning_rate": 1.6350000000000002e-06, + "loss": 0.0006, + "num_tokens": 170544.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 10.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035415973514318466, + "kl": 0.0003276318311691284, + "learning_rate": 1.6380000000000002e-06, + "loss": 0.0, + "num_tokens": 170754.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 10.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.413743257522583, + "kl": 0.04540968965739012, + "learning_rate": 1.6410000000000003e-06, + "loss": 0.0366, + "num_tokens": 171040.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 10.166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03200657293200493, + "kl": 0.001019798728520982, + "learning_rate": 1.6440000000000003e-06, + "loss": 0.0001, + "num_tokens": 171307.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 10.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4697763919830322, + "kl": 0.0023434842005372047, + "learning_rate": 1.647e-06, + "loss": 0.1577, + "num_tokens": 171669.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 10.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13165704905986786, + "kl": 0.019468783400952816, + "learning_rate": 1.65e-06, + "loss": 0.001, + "num_tokens": 171963.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 10.222222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029760370030999184, + "kl": 0.0029473998583853245, + "learning_rate": 1.653e-06, + "loss": 0.0002, + "num_tokens": 172323.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.334692001342773, + "kl": 0.0014246180653572083, + "learning_rate": 1.6560000000000001e-06, + "loss": -0.1715, + "num_tokens": 172563.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9498836994171143, + "kl": 0.0051345787942409515, + "learning_rate": 1.6590000000000001e-06, + "loss": -0.0792, + "num_tokens": 172850.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 10.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02657780796289444, + "kl": 0.0018518269062042236, + "learning_rate": 1.6620000000000001e-06, + "loss": 0.0001, + "num_tokens": 173062.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 10.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.734173059463501, + "kl": 0.021748999832198024, + "learning_rate": 1.6650000000000002e-06, + "loss": 0.108, + "num_tokens": 173417.0, + "reward": 5.0, + "reward_std": 3.5590262413024902, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.5590262413024902, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 10.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04306991398334503, + "kl": 0.001214335861732252, + "learning_rate": 1.6680000000000002e-06, + "loss": 0.0001, + "num_tokens": 173673.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.333333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013166028074920177, + "kl": 0.0011980346753261983, + "learning_rate": 1.6710000000000002e-06, + "loss": 0.0001, + "num_tokens": 173955.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 10.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017330129630863667, + "kl": 7.345527410507202e-05, + "learning_rate": 1.6740000000000002e-06, + "loss": 0.0, + "num_tokens": 174227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.021276595070958138, + "clip_ratio/high_mean": 0.021276595070958138, + "clip_ratio/low_mean": 0.007692307699471712, + "clip_ratio/low_min": 0.007692307699471712, + "clip_ratio/region_mean": 0.02896890277042985, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 10.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.486509323120117, + "kl": 0.0069011535961180925, + "learning_rate": 1.6770000000000002e-06, + "loss": 0.0458, + "num_tokens": 174555.0, + "reward": 2.75, + "reward_std": 1.1902379989624023, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.190238118171692, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 67.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 67.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8934012651443481, + "kl": 0.0007226394955068827, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.4866, + "num_tokens": 175043.0, + "reward": 2.549999952316284, + "reward_std": 2.9000000953674316, + "rewards/reward_combined/mean": 2.549999952316284, + "rewards/reward_combined/std": 2.8999998569488525, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011858646757900715, + "kl": 0.0002608560025691986, + "learning_rate": 1.6830000000000002e-06, + "loss": 0.0, + "num_tokens": 175255.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03355216234922409, + "kl": 0.0005901605036342517, + "learning_rate": 1.6860000000000002e-06, + "loss": 0.0, + "num_tokens": 175474.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 10.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023276211693882942, + "kl": 0.000884422188391909, + "learning_rate": 1.6889999999999998e-06, + "loss": 0.0, + "num_tokens": 175747.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 10.462962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.79395055770874, + "kl": 0.0033861820120364428, + "learning_rate": 1.6919999999999999e-06, + "loss": 0.1825, + "num_tokens": 176022.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05285676568746567, + "kl": 0.0013320036232471466, + "learning_rate": 1.6949999999999999e-06, + "loss": 0.0001, + "num_tokens": 176282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 10.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2973270416259766, + "kl": 0.000938477780437097, + "learning_rate": 1.6979999999999999e-06, + "loss": -0.0155, + "num_tokens": 176603.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 10.518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.575317144393921, + "kl": 0.0024951985105872154, + "learning_rate": 1.7009999999999999e-06, + "loss": 0.1103, + "num_tokens": 176924.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 10.537037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.342179298400879, + "kl": 0.0016367121716029942, + "learning_rate": 1.704e-06, + "loss": -0.0965, + "num_tokens": 177218.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 10.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.08101749420166, + "kl": 0.0023249993100762367, + "learning_rate": 1.707e-06, + "loss": -0.2064, + "num_tokens": 177566.0, + "reward": 0.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.625, + "rewards/reward_combined/std": 0.25, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 10.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020238690078258514, + "kl": 0.001071081671398133, + "learning_rate": 1.71e-06, + "loss": 0.0001, + "num_tokens": 177844.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 10.592592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5214464664459229, + "kl": 0.0064206772949546576, + "learning_rate": 1.713e-06, + "loss": 0.0, + "num_tokens": 178208.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 10.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.121870279312134, + "kl": 0.0037148026167415082, + "learning_rate": 1.716e-06, + "loss": 0.1631, + "num_tokens": 178573.0, + "reward": 3.0, + "reward_std": 3.316624879837036, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.316624879837036, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 10.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.407678127288818, + "kl": 0.005521014798432589, + "learning_rate": 1.719e-06, + "loss": 0.2429, + "num_tokens": 178923.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 10.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.755073547363281, + "kl": 0.0037983747897669673, + "learning_rate": 1.722e-06, + "loss": -0.0337, + "num_tokens": 179242.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 10.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1125705242156982, + "kl": 0.0015287879505194724, + "learning_rate": 1.725e-06, + "loss": 0.0503, + "num_tokens": 179574.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.040618419647217, + "kl": 0.003923992975614965, + "learning_rate": 1.728e-06, + "loss": 0.0309, + "num_tokens": 179854.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 10.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.314767837524414, + "kl": 0.005306090926751494, + "learning_rate": 1.7309999999999998e-06, + "loss": 0.0052, + "num_tokens": 180217.0, + "reward": 4.625, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 4.308422088623047, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 10.722222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026743585243821144, + "kl": 0.0008531607345503289, + "learning_rate": 1.7339999999999998e-06, + "loss": 0.0, + "num_tokens": 180525.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.011363636702299118, + "clip_ratio/low_mean": 0.009803921915590763, + "clip_ratio/low_min": 0.009803921915590763, + "clip_ratio/region_mean": 0.02116755861788988, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 10.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.088840007781982, + "kl": 0.0027468246989883482, + "learning_rate": 1.7369999999999998e-06, + "loss": 0.1058, + "num_tokens": 180868.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021816635853610933, + "kl": 5.990266799926758e-06, + "learning_rate": 1.7399999999999999e-06, + "loss": 0.0, + "num_tokens": 181088.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 10.777777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.219961166381836, + "kl": 0.011513480450958014, + "learning_rate": 1.7429999999999999e-06, + "loss": -0.2091, + "num_tokens": 181430.0, + "reward": 0.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 0.0, + "rewards/reward_combined/std": 1.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 10.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05167926847934723, + "kl": 0.0028135766624473035, + "learning_rate": 1.7459999999999999e-06, + "loss": 0.0001, + "num_tokens": 181736.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 10.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07724424451589584, + "kl": 0.0018440705025568604, + "learning_rate": 1.749e-06, + "loss": 0.0001, + "num_tokens": 181969.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 10.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07867051661014557, + "kl": 0.0016956999897956848, + "learning_rate": 1.752e-06, + "loss": 0.0001, + "num_tokens": 182185.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 10.851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.244733810424805, + "kl": 0.003998629283159971, + "learning_rate": 1.755e-06, + "loss": -0.0022, + "num_tokens": 182484.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 10.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1518163681030273, + "kl": 0.0006980647303862497, + "learning_rate": 1.758e-06, + "loss": -0.0085, + "num_tokens": 182815.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 10.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010196794755756855, + "kl": 0.0003098830402450403, + "learning_rate": 1.761e-06, + "loss": 0.0, + "num_tokens": 183124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.907407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.224768161773682, + "kl": 0.013283525127917528, + "learning_rate": 1.764e-06, + "loss": 0.3444, + "num_tokens": 183425.0, + "reward": 5.0, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.674234628677368, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 10.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5237748622894287, + "kl": 0.21871189028024673, + "learning_rate": 1.767e-06, + "loss": 0.0574, + "num_tokens": 183687.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 10.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8249011039733887, + "kl": 0.002103843493387103, + "learning_rate": 1.77e-06, + "loss": -0.0624, + "num_tokens": 184099.0, + "reward": 0.800000011920929, + "reward_std": 0.9626352787017822, + "rewards/reward_combined/mean": 0.800000011920929, + "rewards/reward_combined/std": 0.9626352787017822, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 10.962962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.345606327056885, + "kl": 0.1526459683664143, + "learning_rate": 1.773e-06, + "loss": -0.0739, + "num_tokens": 184397.0, + "reward": 2.5, + "reward_std": 4.3011627197265625, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 4.3011627197265625, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 10.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04271714389324188, + "kl": 0.001882628130260855, + "learning_rate": 1.776e-06, + "loss": 0.0001, + "num_tokens": 184668.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5479326248168945, + "kl": 0.01541160186752677, + "learning_rate": 1.779e-06, + "loss": 0.0024, + "num_tokens": 184956.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.018518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008236559107899666, + "kl": 0.00152616947889328, + "learning_rate": 1.782e-06, + "loss": 0.0001, + "num_tokens": 185192.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.037037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.302337646484375, + "kl": 0.0047858242760412395, + "learning_rate": 1.785e-06, + "loss": -0.0431, + "num_tokens": 185487.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 11.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033569321036338806, + "kl": 0.0007271227514138445, + "learning_rate": 1.7879999999999999e-06, + "loss": 0.0, + "num_tokens": 185721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.074074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07298684120178223, + "kl": 0.0019379467121325433, + "learning_rate": 1.7909999999999999e-06, + "loss": 0.0001, + "num_tokens": 185975.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.092592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13940539956092834, + "kl": 0.007875355251599103, + "learning_rate": 1.7939999999999999e-06, + "loss": 0.0004, + "num_tokens": 186245.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 599 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.011363636702299118, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 11.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.558523178100586, + "kl": 0.007870005210861564, + "learning_rate": 1.797e-06, + "loss": -0.1084, + "num_tokens": 186561.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 11.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3574469089508057, + "kl": 0.0032532837940379977, + "learning_rate": 1.8e-06, + "loss": 0.1086, + "num_tokens": 186918.0, + "reward": 3.75, + "reward_std": 2.723355770111084, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.723355770111084, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 11.148148148148149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04779445007443428, + "kl": 0.003014355548657477, + "learning_rate": 1.803e-06, + "loss": 0.0002, + "num_tokens": 187212.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 11.166666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.747562289237976, + "kl": 0.009586491622030735, + "learning_rate": 1.806e-06, + "loss": -0.0665, + "num_tokens": 187550.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06118657439947128, + "kl": 0.0015967967920005322, + "learning_rate": 1.809e-06, + "loss": 0.0001, + "num_tokens": 187780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 11.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004736695904284716, + "kl": 0.00037567691470030695, + "learning_rate": 1.812e-06, + "loss": 0.0, + "num_tokens": 188144.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 11.222222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.6015238761901855, + "kl": 0.0037239082157611847, + "learning_rate": 1.815e-06, + "loss": 0.0981, + "num_tokens": 188453.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 11.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6835286617279053, + "kl": 0.006729712942615151, + "learning_rate": 1.818e-06, + "loss": 0.0942, + "num_tokens": 188797.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.394619941711426, + "kl": 0.005084015661850572, + "learning_rate": 1.821e-06, + "loss": 0.0127, + "num_tokens": 189074.0, + "reward": 4.375, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 3.902456521987915, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008474576286971569, + "clip_ratio/low_min": 0.008474576286971569, + "clip_ratio/region_mean": 0.008474576286971569, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 11.277777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.031157493591309, + "kl": 0.0069244245532900095, + "learning_rate": 1.824e-06, + "loss": 0.1142, + "num_tokens": 189407.0, + "reward": 1.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.296296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14288270473480225, + "kl": 0.007235784083604813, + "learning_rate": 1.827e-06, + "loss": 0.0004, + "num_tokens": 189705.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042842842638492584, + "kl": 0.0003638714551925659, + "learning_rate": 1.83e-06, + "loss": 0.0, + "num_tokens": 189917.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.656713485717773, + "kl": 0.00391710945405066, + "learning_rate": 1.833e-06, + "loss": -0.1864, + "num_tokens": 190207.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1138060986995697, + "kl": 0.01024311501532793, + "learning_rate": 1.836e-06, + "loss": 0.0005, + "num_tokens": 190503.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 11.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03635697811841965, + "kl": 0.001109056938730646, + "learning_rate": 1.839e-06, + "loss": 0.0001, + "num_tokens": 190832.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01670781336724758, + "kl": 0.002560785796958953, + "learning_rate": 1.8420000000000001e-06, + "loss": 0.0001, + "num_tokens": 191102.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.01666666753590107, + "clip_ratio/high_mean": 0.01666666753590107, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01666666753590107, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 11.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.373130798339844, + "kl": 0.00798106868751347, + "learning_rate": 1.8450000000000001e-06, + "loss": -0.268, + "num_tokens": 191424.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00027208359097130597, + "kl": 8.612871170043945e-06, + "learning_rate": 1.848e-06, + "loss": 0.0, + "num_tokens": 191644.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0567723885178566, + "kl": 0.004504364216700196, + "learning_rate": 1.851e-06, + "loss": 0.0002, + "num_tokens": 191946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 11.462962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007303483784198761, + "kl": 0.015217685140669346, + "learning_rate": 1.854e-06, + "loss": 0.0008, + "num_tokens": 192206.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 11.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1344587653875351, + "kl": 0.00376090407371521, + "learning_rate": 1.857e-06, + "loss": 0.0002, + "num_tokens": 192422.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 11.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7340598106384277, + "kl": 0.001701810397207737, + "learning_rate": 1.86e-06, + "loss": 0.0405, + "num_tokens": 192751.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 11.518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.458225250244141, + "kl": 0.004642652929760516, + "learning_rate": 1.863e-06, + "loss": -0.0358, + "num_tokens": 193027.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.537037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03666996210813522, + "kl": 0.0018023437005467713, + "learning_rate": 1.866e-06, + "loss": 0.0001, + "num_tokens": 193301.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 11.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014777244068682194, + "kl": 8.708939640200697e-05, + "learning_rate": 1.869e-06, + "loss": 0.0, + "num_tokens": 193609.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 11.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.767086982727051, + "kl": 0.004180784453637898, + "learning_rate": 1.872e-06, + "loss": 0.153, + "num_tokens": 193948.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 11.592592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05402153730392456, + "kl": 0.0068692793138325214, + "learning_rate": 1.875e-06, + "loss": 0.0003, + "num_tokens": 194245.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 11.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0920700952410698, + "kl": 0.0013527125120162964, + "learning_rate": 1.878e-06, + "loss": 0.0001, + "num_tokens": 194455.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006711984518915415, + "kl": 0.0010253533837385476, + "learning_rate": 1.881e-06, + "loss": 0.0001, + "num_tokens": 194715.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 11.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.964512825012207, + "kl": 0.009294234216213226, + "learning_rate": 1.884e-06, + "loss": -0.0983, + "num_tokens": 195069.0, + "reward": 2.625, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 2.625, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 11.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.334890842437744, + "kl": 0.0058948209043592215, + "learning_rate": 1.887e-06, + "loss": 0.0057, + "num_tokens": 195379.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 11.685185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751858800649643, + "kl": 0.008289248682558537, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.0004, + "num_tokens": 195774.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.703703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11111831665039062, + "kl": 0.0027627437375485897, + "learning_rate": 1.8930000000000001e-06, + "loss": 0.0001, + "num_tokens": 195993.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.722222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.696523427963257, + "kl": 0.004287190269678831, + "learning_rate": 1.8960000000000001e-06, + "loss": -0.0525, + "num_tokens": 196302.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10204889625310898, + "kl": 0.017444612458348274, + "learning_rate": 1.8990000000000002e-06, + "loss": 0.0009, + "num_tokens": 196594.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 11.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004702435340732336, + "kl": 0.00022815167903900146, + "learning_rate": 1.9020000000000002e-06, + "loss": 0.0, + "num_tokens": 196838.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 11.777777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06758100539445877, + "kl": 0.004599316511303186, + "learning_rate": 1.905e-06, + "loss": 0.0002, + "num_tokens": 197147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 11.796296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.605614185333252, + "kl": 0.0018278235220350325, + "learning_rate": 1.908e-06, + "loss": 0.1203, + "num_tokens": 197435.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 11.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.515329837799072, + "kl": 0.03643876314163208, + "learning_rate": 1.911e-06, + "loss": 0.1542, + "num_tokens": 197745.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 11.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05337267369031906, + "kl": 0.001255527138710022, + "learning_rate": 1.9140000000000002e-06, + "loss": 0.0001, + "num_tokens": 198005.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 11.851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2822935581207275, + "kl": 0.0013061455101706088, + "learning_rate": 1.917e-06, + "loss": -0.0226, + "num_tokens": 198434.0, + "reward": 1.2999999523162842, + "reward_std": 1.536229133605957, + "rewards/reward_combined/mean": 1.2999999523162842, + "rewards/reward_combined/std": 1.536229133605957, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060271814465522766, + "kl": 0.0039077382534742355, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.0002, + "num_tokens": 198715.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007071491796523333, + "kl": 0.00014022439427208155, + "learning_rate": 1.923e-06, + "loss": 0.0, + "num_tokens": 198971.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 11.907407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.723827838897705, + "kl": 0.03742720186710358, + "learning_rate": 1.9260000000000003e-06, + "loss": 0.2653, + "num_tokens": 199258.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 11.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.672297239303589, + "kl": 0.0019956419564550743, + "learning_rate": 1.929e-06, + "loss": 0.0335, + "num_tokens": 199540.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 11.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.243220806121826, + "kl": 0.01041489327326417, + "learning_rate": 1.9320000000000003e-06, + "loss": 0.0625, + "num_tokens": 199863.0, + "reward": 2.375, + "reward_std": 3.75, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 3.75, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 11.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09201733767986298, + "kl": 0.01014986983500421, + "learning_rate": 1.935e-06, + "loss": 0.0005, + "num_tokens": 200185.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 11.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05975935608148575, + "kl": 0.006647712318226695, + "learning_rate": 1.938e-06, + "loss": 0.0003, + "num_tokens": 200518.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 12.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04419155418872833, + "kl": 0.0016164439875865355, + "learning_rate": 1.941e-06, + "loss": 0.0001, + "num_tokens": 200782.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 12.018518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03825339302420616, + "kl": 0.0024186375085264444, + "learning_rate": 1.944e-06, + "loss": 0.0001, + "num_tokens": 201079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 12.037037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2707661986351013, + "kl": 0.012258023954927921, + "learning_rate": 1.947e-06, + "loss": 0.0006, + "num_tokens": 201337.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 12.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10657081753015518, + "kl": 0.005779258208349347, + "learning_rate": 1.95e-06, + "loss": 0.0003, + "num_tokens": 201635.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 12.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.175602912902832, + "kl": 0.00963138323277235, + "learning_rate": 1.953e-06, + "loss": 0.0012, + "num_tokens": 201916.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 12.092592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007182702422142029, + "kl": 0.015305588487535715, + "learning_rate": 1.956e-06, + "loss": 0.0008, + "num_tokens": 202176.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030040180310606956, + "kl": 0.0013423172640614212, + "learning_rate": 1.9590000000000002e-06, + "loss": 0.0001, + "num_tokens": 202450.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 12.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.954211235046387, + "kl": 0.008251628605648875, + "learning_rate": 1.962e-06, + "loss": -0.0366, + "num_tokens": 202732.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 12.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9378139972686768, + "kl": 0.007347192615270615, + "learning_rate": 1.9650000000000002e-06, + "loss": -0.2469, + "num_tokens": 203092.0, + "reward": 5.875, + "reward_std": 3.5910768508911133, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.5910770893096924, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 12.166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04142593964934349, + "kl": 0.0014567188918590546, + "learning_rate": 1.968e-06, + "loss": 0.0001, + "num_tokens": 203352.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 12.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030019355472177267, + "kl": 0.00010986030247295275, + "learning_rate": 1.9710000000000003e-06, + "loss": 0.0, + "num_tokens": 203620.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 12.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001762113068252802, + "kl": 6.105055217631161e-05, + "learning_rate": 1.974e-06, + "loss": 0.0, + "num_tokens": 203931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 12.222222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.4471435546875, + "kl": 0.024239951744675636, + "learning_rate": 1.9770000000000003e-06, + "loss": 0.0567, + "num_tokens": 204168.0, + "reward": 3.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 1.25, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 12.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006828906014561653, + "kl": 0.0007608592859469354, + "learning_rate": 1.98e-06, + "loss": 0.0, + "num_tokens": 204428.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 12.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0069746170192956924, + "kl": 0.00021936596021987498, + "learning_rate": 1.9830000000000003e-06, + "loss": 0.0, + "num_tokens": 204747.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13971294462680817, + "kl": 0.007924544624984264, + "learning_rate": 1.986e-06, + "loss": 0.0004, + "num_tokens": 205047.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 12.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.632049560546875, + "kl": 0.007800285937264562, + "learning_rate": 1.9890000000000004e-06, + "loss": -0.0547, + "num_tokens": 205363.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013179701752960682, + "kl": 0.002018794766627252, + "learning_rate": 1.992e-06, + "loss": 0.0001, + "num_tokens": 205645.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 12.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3997981548309326, + "kl": 0.003044098149985075, + "learning_rate": 1.995e-06, + "loss": 0.0592, + "num_tokens": 205979.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01910579577088356, + "kl": 0.002425070386379957, + "learning_rate": 1.998e-06, + "loss": 0.0001, + "num_tokens": 206249.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 12.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.639466285705566, + "kl": 0.020511489361524582, + "learning_rate": 2.001e-06, + "loss": 0.0638, + "num_tokens": 206593.0, + "reward": 2.125, + "reward_std": 3.9449334144592285, + "rewards/reward_combined/mean": 2.125, + "rewards/reward_combined/std": 3.9449334144592285, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 12.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.862105369567871, + "kl": 0.03912351280450821, + "learning_rate": 2.004e-06, + "loss": 0.1246, + "num_tokens": 206900.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 12.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.168433427810669, + "kl": 0.002303434011992067, + "learning_rate": 2.007e-06, + "loss": -0.0629, + "num_tokens": 207319.0, + "reward": 0.675000011920929, + "reward_std": 1.0436315536499023, + "rewards/reward_combined/mean": 0.675000011920929, + "rewards/reward_combined/std": 1.0436315536499023, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 12.425925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.924370527267456, + "kl": 0.00353796174749732, + "learning_rate": 2.0100000000000002e-06, + "loss": 0.0053, + "num_tokens": 207621.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 12.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1171138286590576, + "kl": 0.0020666478958446532, + "learning_rate": 2.013e-06, + "loss": -0.0387, + "num_tokens": 207989.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 12.462962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6609792709350586, + "kl": 0.7828765045851469, + "learning_rate": 2.0160000000000003e-06, + "loss": 0.083, + "num_tokens": 208275.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 12.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05333259701728821, + "kl": 0.0006788596510887146, + "learning_rate": 2.019e-06, + "loss": 0.0, + "num_tokens": 208481.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12892459332942963, + "kl": 0.014746975619345903, + "learning_rate": 2.0220000000000003e-06, + "loss": 0.0007, + "num_tokens": 208749.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 12.518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.480861663818359, + "kl": 0.015723032876849174, + "learning_rate": 2.025e-06, + "loss": 0.0323, + "num_tokens": 209049.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 12.537037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6830129623413086, + "kl": 0.0030975337140262127, + "learning_rate": 2.0280000000000003e-06, + "loss": 0.0279, + "num_tokens": 209378.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 12.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05692094564437866, + "kl": 0.0045005188876530156, + "learning_rate": 2.031e-06, + "loss": 0.0004, + "num_tokens": 209719.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 12.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011465778516139835, + "kl": 2.5704503059387207e-06, + "learning_rate": 2.0340000000000003e-06, + "loss": 0.0, + "num_tokens": 209939.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 12.592592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06668251752853394, + "kl": 0.008849663892760873, + "learning_rate": 2.037e-06, + "loss": 0.0005, + "num_tokens": 210277.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 12.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18250074982643127, + "kl": 0.007448920048773289, + "learning_rate": 2.0400000000000004e-06, + "loss": 0.0004, + "num_tokens": 210504.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007042253389954567, + "clip_ratio/low_min": 0.007042253389954567, + "clip_ratio/region_mean": 0.007042253389954567, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 12.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.01829195022583, + "kl": 0.013138486538082361, + "learning_rate": 2.043e-06, + "loss": -0.1311, + "num_tokens": 210859.0, + "reward": 2.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 3.5, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 12.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4903883934020996, + "kl": 0.011826877947896719, + "learning_rate": 2.0460000000000004e-06, + "loss": 0.0819, + "num_tokens": 211183.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 12.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.724998950958252, + "kl": 0.01222726097330451, + "learning_rate": 2.049e-06, + "loss": 0.0461, + "num_tokens": 211526.0, + "reward": 5.0, + "reward_std": 3.5590262413024902, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.5590262413024902, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 12.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.051443099975586, + "kl": 0.015238044783473015, + "learning_rate": 2.052e-06, + "loss": 0.0276, + "num_tokens": 211869.0, + "reward": 2.75, + "reward_std": 1.1902379989624023, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.190238118171692, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 12.703703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032885242253541946, + "kl": 0.0011593550152610987, + "learning_rate": 2.0550000000000002e-06, + "loss": 0.0001, + "num_tokens": 212137.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 12.722222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014226168394088745, + "kl": 0.0033940672874450684, + "learning_rate": 2.058e-06, + "loss": 0.0002, + "num_tokens": 212417.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 12.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.164578914642334, + "kl": 0.007152646780014038, + "learning_rate": 2.0610000000000003e-06, + "loss": 0.0035, + "num_tokens": 212753.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 12.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5273241996765137, + "kl": 0.014231080407625996, + "learning_rate": 2.064e-06, + "loss": -0.1156, + "num_tokens": 213069.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 12.777777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01314019225537777, + "kl": 0.00022470951080322266, + "learning_rate": 2.067e-06, + "loss": 0.0, + "num_tokens": 213281.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 12.796296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.443144798278809, + "kl": 0.0004062642801727634, + "learning_rate": 2.07e-06, + "loss": 0.0468, + "num_tokens": 213502.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 12.814814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03304954990744591, + "kl": 0.00044177046220283955, + "learning_rate": 2.073e-06, + "loss": 0.0, + "num_tokens": 213736.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 12.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01379492785781622, + "kl": 0.0003537073644110933, + "learning_rate": 2.0759999999999997e-06, + "loss": 0.0, + "num_tokens": 213992.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 12.851851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03360677883028984, + "kl": 0.0028873877599835396, + "learning_rate": 2.079e-06, + "loss": 0.0001, + "num_tokens": 214304.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 12.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05105031281709671, + "kl": 0.001215046620927751, + "learning_rate": 2.0819999999999997e-06, + "loss": 0.0001, + "num_tokens": 214564.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 695 + }, + { + "clip_ratio/high_max": 0.010638297535479069, + "clip_ratio/high_mean": 0.010638297535479069, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010638297535479069, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 12.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.598762512207031, + "kl": 0.012793307425454259, + "learning_rate": 2.085e-06, + "loss": 0.1309, + "num_tokens": 214877.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 12.907407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21235185861587524, + "kl": 0.029944309033453465, + "learning_rate": 2.0879999999999997e-06, + "loss": 0.0011, + "num_tokens": 215207.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 12.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3441944420337677, + "kl": 0.0203075148165226, + "learning_rate": 2.091e-06, + "loss": 0.0012, + "num_tokens": 215454.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.609631538391113, + "kl": 0.03404449298977852, + "learning_rate": 2.0939999999999998e-06, + "loss": -0.109, + "num_tokens": 215744.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 12.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032854482531547546, + "kl": 0.0029180049896240234, + "learning_rate": 2.097e-06, + "loss": 0.0001, + "num_tokens": 215956.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 12.981481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.977245807647705, + "kl": 0.013467305339872837, + "learning_rate": 2.1e-06, + "loss": 0.0012, + "num_tokens": 216300.0, + "reward": 3.0, + "reward_std": 3.535533905029297, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.535533905029297, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 13.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.03212833404541, + "kl": 0.008037997176870704, + "learning_rate": 2.103e-06, + "loss": -0.098, + "num_tokens": 216612.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 13.018518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9668593406677246, + "kl": 0.011052212677896023, + "learning_rate": 2.106e-06, + "loss": 0.0269, + "num_tokens": 216928.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 13.037037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031044969335198402, + "kl": 0.003113462822511792, + "learning_rate": 2.109e-06, + "loss": 0.0002, + "num_tokens": 217260.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 62.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 62.0, + "completions/mean_terminated_length": 62.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 13.055555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.012850284576416, + "kl": 0.005787973990663886, + "learning_rate": 2.112e-06, + "loss": 0.3668, + "num_tokens": 217728.0, + "reward": 0.5, + "reward_std": 1.0, + "rewards/reward_combined/mean": 0.5, + "rewards/reward_combined/std": 1.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 65.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 65.5, + "completions/mean_terminated_length": 65.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 13.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.181117296218872, + "kl": 0.013391206972301006, + "learning_rate": 2.115e-06, + "loss": 0.3524, + "num_tokens": 218242.0, + "reward": -0.07500004768371582, + "reward_std": 3.053277015686035, + "rewards/reward_combined/mean": -0.07500004768371582, + "rewards/reward_combined/std": 3.053277015686035, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 13.092592592592593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07046175748109818, + "kl": 0.02439271006733179, + "learning_rate": 2.118e-06, + "loss": 0.0012, + "num_tokens": 218542.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 13.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1288159340620041, + "kl": 0.0032868816051632166, + "learning_rate": 2.121e-06, + "loss": 0.0002, + "num_tokens": 218806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 13.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7185447216033936, + "kl": 0.017199629452079535, + "learning_rate": 2.124e-06, + "loss": 0.0385, + "num_tokens": 219094.0, + "reward": 5.0, + "reward_std": 3.5590262413024902, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.5590262413024902, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 13.148148148148149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03423300385475159, + "kl": 0.004782620584592223, + "learning_rate": 2.127e-06, + "loss": 0.0002, + "num_tokens": 219368.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 13.166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13810978829860687, + "kl": 0.012777527328580618, + "learning_rate": 2.13e-06, + "loss": 0.0007, + "num_tokens": 219642.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.185185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0054832505993545055, + "kl": 0.00024031996872508898, + "learning_rate": 2.133e-06, + "loss": 0.0, + "num_tokens": 219902.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 13.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04271876811981201, + "kl": 0.0008862614631652832, + "learning_rate": 2.136e-06, + "loss": 0.0, + "num_tokens": 220110.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.222222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028154635801911354, + "kl": 0.0006726061401423067, + "learning_rate": 2.1389999999999998e-06, + "loss": 0.0, + "num_tokens": 220378.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 13.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005581853445619345, + "kl": 0.0025982260704040527, + "learning_rate": 2.142e-06, + "loss": 0.0001, + "num_tokens": 220614.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07566168159246445, + "kl": 0.0019389942463021725, + "learning_rate": 2.145e-06, + "loss": 0.0001, + "num_tokens": 220870.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 13.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005097101908177137, + "kl": 0.0002322739819646813, + "learning_rate": 2.148e-06, + "loss": 0.0, + "num_tokens": 221186.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 13.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.316486358642578, + "kl": 0.01027616742067039, + "learning_rate": 2.151e-06, + "loss": 0.0341, + "num_tokens": 221542.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 13.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16810569167137146, + "kl": 0.019446187652647495, + "learning_rate": 2.154e-06, + "loss": 0.001, + "num_tokens": 221910.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 13.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.1822686195373535, + "kl": 0.0010762370002339594, + "learning_rate": 2.157e-06, + "loss": -0.0719, + "num_tokens": 222221.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 13.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017959382385015488, + "kl": 0.015388155821710825, + "learning_rate": 2.16e-06, + "loss": 0.0008, + "num_tokens": 222505.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.2283525466918945, + "kl": 0.005473986966535449, + "learning_rate": 2.163e-06, + "loss": -0.185, + "num_tokens": 222783.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01147428434342146, + "kl": 0.0003103732888121158, + "learning_rate": 2.166e-06, + "loss": 0.0, + "num_tokens": 223043.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 13.407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5182278156280518, + "kl": 0.007994799176231027, + "learning_rate": 2.169e-06, + "loss": 0.1362, + "num_tokens": 223377.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 13.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15986865758895874, + "kl": 0.02012438978999853, + "learning_rate": 2.172e-06, + "loss": 0.001, + "num_tokens": 223672.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 13.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0555325485765934, + "kl": 0.0026182486035395414, + "learning_rate": 2.175e-06, + "loss": 0.0001, + "num_tokens": 223970.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 13.462962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002697804942727089, + "kl": 9.336024595540948e-05, + "learning_rate": 2.178e-06, + "loss": 0.0, + "num_tokens": 224189.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 13.481481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.931563854217529, + "kl": 0.012667708564549685, + "learning_rate": 2.181e-06, + "loss": 0.056, + "num_tokens": 224513.0, + "reward": 1.625, + "reward_std": 2.3935678005218506, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 2.3935678005218506, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 13.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12529534101486206, + "kl": 0.004890830256044865, + "learning_rate": 2.184e-06, + "loss": 0.0002, + "num_tokens": 224839.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 13.518518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06014099717140198, + "kl": 0.006085360422730446, + "learning_rate": 2.187e-06, + "loss": 0.0003, + "num_tokens": 225172.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 13.537037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011584106832742691, + "kl": 0.0003792308270931244, + "learning_rate": 2.19e-06, + "loss": 0.0, + "num_tokens": 225432.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 13.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0322616845369339, + "kl": 0.0035367043456062675, + "learning_rate": 2.193e-06, + "loss": 0.0002, + "num_tokens": 225714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08717934042215347, + "kl": 0.004229001933708787, + "learning_rate": 2.196e-06, + "loss": 0.0002, + "num_tokens": 226018.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.592592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.425130844116211, + "kl": 0.007111624465323985, + "learning_rate": 2.199e-06, + "loss": -0.0011, + "num_tokens": 226288.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 13.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041111864149570465, + "kl": 0.003297999035567045, + "learning_rate": 2.202e-06, + "loss": 0.0001, + "num_tokens": 226611.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 13.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.724043607711792, + "kl": 0.0074762695003300905, + "learning_rate": 2.205e-06, + "loss": -0.0028, + "num_tokens": 226930.0, + "reward": 4.25, + "reward_std": 3.752776622772217, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 3.752776861190796, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 13.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7709622383117676, + "kl": 0.04449963755905628, + "learning_rate": 2.208e-06, + "loss": 0.2254, + "num_tokens": 227278.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 13.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8305257558822632, + "kl": 0.0004713718080893159, + "learning_rate": 2.211e-06, + "loss": -0.0214, + "num_tokens": 227644.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 13.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.893324375152588, + "kl": 0.005538077675737441, + "learning_rate": 2.214e-06, + "loss": 0.0954, + "num_tokens": 227917.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 739 + }, + { + "clip_ratio/high_max": 0.006493506487458944, + "clip_ratio/high_mean": 0.006493506487458944, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006493506487458944, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 13.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.699036121368408, + "kl": 0.01296887407079339, + "learning_rate": 2.217e-06, + "loss": -0.0991, + "num_tokens": 228285.0, + "reward": 2.875, + "reward_std": 2.3935678005218506, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 2.3935678005218506, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.722222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.624296188354492, + "kl": 0.019901788793504238, + "learning_rate": 2.22e-06, + "loss": 0.0198, + "num_tokens": 228546.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 13.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03395688533782959, + "kl": 0.015439601615071297, + "learning_rate": 2.223e-06, + "loss": 0.0008, + "num_tokens": 228843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 13.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04887813702225685, + "kl": 0.0015638243203284219, + "learning_rate": 2.226e-06, + "loss": 0.0001, + "num_tokens": 229077.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 13.777777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07996886223554611, + "kl": 0.007422972586937249, + "learning_rate": 2.229e-06, + "loss": 0.0004, + "num_tokens": 229361.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 13.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10441425442695618, + "kl": 0.008619187399744987, + "learning_rate": 2.232e-06, + "loss": 0.0004, + "num_tokens": 229654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 13.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.263534545898438, + "kl": 0.00315009499900043, + "learning_rate": 2.235e-06, + "loss": 0.3098, + "num_tokens": 229883.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 13.833333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.224886894226074, + "kl": 0.003548555658198893, + "learning_rate": 2.238e-06, + "loss": -0.0285, + "num_tokens": 230233.0, + "reward": 1.5, + "reward_std": 2.2730302810668945, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 2.2730302810668945, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 13.851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4481959342956543, + "kl": 0.02331713866442442, + "learning_rate": 2.2410000000000002e-06, + "loss": 0.2235, + "num_tokens": 230686.0, + "reward": 3.5, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 3.674234628677368, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 13.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015618541510775685, + "kl": 2.5704503059387207e-06, + "learning_rate": 2.244e-06, + "loss": 0.0, + "num_tokens": 230906.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 13.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6655162572860718, + "kl": 0.003726708237081766, + "learning_rate": 2.2470000000000003e-06, + "loss": -0.0406, + "num_tokens": 231326.0, + "reward": 1.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 13.907407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.533583641052246, + "kl": 0.03191567026078701, + "learning_rate": 2.25e-06, + "loss": -0.0356, + "num_tokens": 231621.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 13.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010266447439789772, + "kl": 0.014777874108403921, + "learning_rate": 2.253e-06, + "loss": 0.0007, + "num_tokens": 231881.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 13.944444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044206928461790085, + "kl": 0.004948943882482126, + "learning_rate": 2.256e-06, + "loss": 0.0002, + "num_tokens": 232176.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 13.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004614643286913633, + "kl": 0.00020968914031982422, + "learning_rate": 2.259e-06, + "loss": 0.0, + "num_tokens": 232420.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 13.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12243378907442093, + "kl": 0.0037477342411875725, + "learning_rate": 2.262e-06, + "loss": 0.0002, + "num_tokens": 232634.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 14.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030719023197889328, + "kl": 0.0016948133707046509, + "learning_rate": 2.265e-06, + "loss": 0.0001, + "num_tokens": 232846.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 14.018518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0183265209198, + "kl": 0.005362946190871298, + "learning_rate": 2.268e-06, + "loss": 0.0126, + "num_tokens": 233180.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 14.037037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9046735763549805, + "kl": 0.025655806064605713, + "learning_rate": 2.271e-06, + "loss": 0.0606, + "num_tokens": 233525.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 14.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005016519222408533, + "kl": 0.002717442810535431, + "learning_rate": 2.274e-06, + "loss": 0.0001, + "num_tokens": 233761.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 14.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.308925151824951, + "kl": 0.002663616935024038, + "learning_rate": 2.277e-06, + "loss": 0.0167, + "num_tokens": 234076.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 14.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.024909019470215, + "kl": 0.7515916135162115, + "learning_rate": 2.28e-06, + "loss": -0.0305, + "num_tokens": 234370.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 14.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08666858077049255, + "kl": 0.0051762983202934265, + "learning_rate": 2.283e-06, + "loss": 0.0003, + "num_tokens": 234630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 14.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029259422793984413, + "kl": 0.002253405749797821, + "learning_rate": 2.2860000000000002e-06, + "loss": 0.0001, + "num_tokens": 234944.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 14.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6150531768798828, + "kl": 0.005932063329964876, + "learning_rate": 2.289e-06, + "loss": 0.0327, + "num_tokens": 235357.0, + "reward": 0.875, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 0.875, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 14.166666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03399658203125, + "kl": 0.015308836940675974, + "learning_rate": 2.2920000000000002e-06, + "loss": -0.0414, + "num_tokens": 235706.0, + "reward": 3.25, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 14.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.235726356506348, + "kl": 0.018397110048681498, + "learning_rate": 2.295e-06, + "loss": -0.0134, + "num_tokens": 236039.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026918374001979828, + "kl": 0.001450765848858282, + "learning_rate": 2.2980000000000003e-06, + "loss": 0.0001, + "num_tokens": 236301.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 14.222222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.462699890136719, + "kl": 0.007106927805580199, + "learning_rate": 2.301e-06, + "loss": -0.0021, + "num_tokens": 236625.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8901407718658447, + "kl": 0.0013421766343526542, + "learning_rate": 2.3040000000000003e-06, + "loss": -0.0006, + "num_tokens": 236900.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 14.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03274410590529442, + "kl": 0.002124100923538208, + "learning_rate": 2.307e-06, + "loss": 0.0001, + "num_tokens": 237112.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 14.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014366337098181248, + "kl": 0.00023959364625625312, + "learning_rate": 2.31e-06, + "loss": 0.0, + "num_tokens": 237348.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 14.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9902851581573486, + "kl": 0.035965283401310444, + "learning_rate": 2.313e-06, + "loss": -0.0236, + "num_tokens": 237691.0, + "reward": 4.125, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 2.25, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04854524880647659, + "kl": 0.0013521099463105202, + "learning_rate": 2.316e-06, + "loss": 0.0001, + "num_tokens": 237948.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 14.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.973568916320801, + "kl": 0.0028942684293724597, + "learning_rate": 2.319e-06, + "loss": 0.0326, + "num_tokens": 238230.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 14.351851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.36252498626709, + "kl": 0.04083090089261532, + "learning_rate": 2.322e-06, + "loss": 0.3214, + "num_tokens": 238591.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.025457859039307, + "kl": 0.0012301181559450924, + "learning_rate": 2.325e-06, + "loss": 0.1834, + "num_tokens": 238874.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 14.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.516607761383057, + "kl": 0.06527543067932129, + "learning_rate": 2.328e-06, + "loss": 0.1089, + "num_tokens": 239186.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 14.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022976483160164207, + "kl": 5.662441253662109e-06, + "learning_rate": 2.3310000000000002e-06, + "loss": 0.0, + "num_tokens": 239406.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 14.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25756582617759705, + "kl": 0.03808259125798941, + "learning_rate": 2.334e-06, + "loss": 0.002, + "num_tokens": 239709.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.444444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.000672817230225, + "kl": 0.011569038964807987, + "learning_rate": 2.3370000000000002e-06, + "loss": 0.0643, + "num_tokens": 239997.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 780 + }, + { + "clip_ratio/high_max": 0.01984127052128315, + "clip_ratio/high_mean": 0.01984127052128315, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01984127052128315, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 14.462962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.125977993011475, + "kl": 0.02409778255969286, + "learning_rate": 2.34e-06, + "loss": -0.335, + "num_tokens": 240318.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 14.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15390489995479584, + "kl": 0.01383195398375392, + "learning_rate": 2.3430000000000003e-06, + "loss": 0.0007, + "num_tokens": 240634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1950475573539734, + "kl": 0.011463714996352792, + "learning_rate": 2.346e-06, + "loss": 0.0006, + "num_tokens": 240902.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 14.518518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01879250258207321, + "kl": 0.002030833507888019, + "learning_rate": 2.3490000000000003e-06, + "loss": 0.0001, + "num_tokens": 241268.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 14.537037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00866797473281622, + "kl": 0.00024850977933965623, + "learning_rate": 2.352e-06, + "loss": 0.0, + "num_tokens": 241585.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 14.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029151448979973793, + "kl": 0.0009359948744531721, + "learning_rate": 2.3550000000000003e-06, + "loss": 0.0, + "num_tokens": 241892.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.574074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09562128037214279, + "kl": 0.007301156176254153, + "learning_rate": 2.358e-06, + "loss": 0.0004, + "num_tokens": 242177.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.592592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.764527797698975, + "kl": 0.04302992485463619, + "learning_rate": 2.3610000000000003e-06, + "loss": -0.1232, + "num_tokens": 242472.0, + "reward": 3.875, + "reward_std": 2.9545164108276367, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 2.9545164108276367, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 14.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00547375250607729, + "kl": 0.00022152662131702527, + "learning_rate": 2.364e-06, + "loss": 0.0, + "num_tokens": 242692.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 14.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.084255218505859, + "kl": 0.019813910126686096, + "learning_rate": 2.367e-06, + "loss": 0.1207, + "num_tokens": 243025.0, + "reward": 4.125, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.308422088623047, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 14.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.163844347000122, + "kl": 0.01603899523615837, + "learning_rate": 2.37e-06, + "loss": -0.0168, + "num_tokens": 243351.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 14.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1018881797790527, + "kl": 0.017605592496693134, + "learning_rate": 2.373e-06, + "loss": -0.0646, + "num_tokens": 243704.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 14.685185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01398887112736702, + "kl": 0.001736477017402649, + "learning_rate": 2.376e-06, + "loss": 0.0001, + "num_tokens": 243920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 14.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.382302761077881, + "kl": 0.04818444326519966, + "learning_rate": 2.379e-06, + "loss": -0.0153, + "num_tokens": 244215.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.722222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.991127967834473, + "kl": 0.014985023532062769, + "learning_rate": 2.3820000000000002e-06, + "loss": 0.1454, + "num_tokens": 244483.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 14.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.7530293464660645, + "kl": 0.010870927944779396, + "learning_rate": 2.385e-06, + "loss": 0.1486, + "num_tokens": 244785.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.771587371826172, + "kl": 0.004894593148492277, + "learning_rate": 2.3880000000000003e-06, + "loss": 0.021, + "num_tokens": 245070.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.777777777777779, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.494588375091553, + "kl": 0.008534710621461272, + "learning_rate": 2.391e-06, + "loss": -0.1524, + "num_tokens": 245352.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 14.796296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005379894282668829, + "kl": 0.01579880900681019, + "learning_rate": 2.3940000000000003e-06, + "loss": 0.0008, + "num_tokens": 245612.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.056868553161621, + "kl": 0.008523806929588318, + "learning_rate": 2.397e-06, + "loss": 0.0612, + "num_tokens": 245911.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 14.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14603033661842346, + "kl": 0.004562627989798784, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0002, + "num_tokens": 246156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 68.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 68.25, + "completions/mean_terminated_length": 68.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5938432216644287, + "kl": 0.011430806946009398, + "learning_rate": 2.403e-06, + "loss": 0.46, + "num_tokens": 246649.0, + "reward": 2.799999952316284, + "reward_std": 1.399999976158142, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 1.399999976158142, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.2324957847595215, + "kl": 0.004533653263933957, + "learning_rate": 2.4060000000000003e-06, + "loss": -0.0011, + "num_tokens": 246941.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 14.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.289831161499023, + "kl": 0.008422995451837778, + "learning_rate": 2.409e-06, + "loss": -0.0143, + "num_tokens": 247212.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 14.907407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07687978446483612, + "kl": 0.007713424973189831, + "learning_rate": 2.4120000000000004e-06, + "loss": 0.0004, + "num_tokens": 247537.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 14.925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07869924604892731, + "kl": 0.0005496889352798462, + "learning_rate": 2.415e-06, + "loss": 0.0, + "num_tokens": 247749.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 14.944444444444445, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.808557987213135, + "kl": 0.023773484863340855, + "learning_rate": 2.4180000000000004e-06, + "loss": 0.1667, + "num_tokens": 248027.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019696783274412155, + "kl": 0.0003880545264109969, + "learning_rate": 2.421e-06, + "loss": 0.0, + "num_tokens": 248295.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 14.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5050677061080933, + "kl": 0.02722766832448542, + "learning_rate": 2.4240000000000004e-06, + "loss": 0.0012, + "num_tokens": 248571.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 15.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17297464609146118, + "kl": 0.006345292087644339, + "learning_rate": 2.4270000000000002e-06, + "loss": 0.0004, + "num_tokens": 248781.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 15.018518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012793388217687607, + "kl": 0.0004118494689464569, + "learning_rate": 2.43e-06, + "loss": 0.0, + "num_tokens": 249041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 15.037037037037036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05514936149120331, + "kl": 0.005671827122569084, + "learning_rate": 2.4330000000000003e-06, + "loss": 0.0003, + "num_tokens": 249353.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 15.055555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10728318989276886, + "kl": 0.01710322964936495, + "learning_rate": 2.436e-06, + "loss": 0.0009, + "num_tokens": 249679.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 15.074074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004523104056715965, + "kl": 0.015971110202372074, + "learning_rate": 2.439e-06, + "loss": 0.0008, + "num_tokens": 249939.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 15.092592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3868942260742188, + "kl": 0.02833088766783476, + "learning_rate": 2.442e-06, + "loss": -0.0053, + "num_tokens": 250283.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 15.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006653910502791405, + "kl": 0.00028151705919299275, + "learning_rate": 2.445e-06, + "loss": 0.0, + "num_tokens": 250601.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 15.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08972105383872986, + "kl": 0.0036098076961934566, + "learning_rate": 2.448e-06, + "loss": 0.0002, + "num_tokens": 250866.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 15.148148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.391178131103516, + "kl": 0.018015262205153704, + "learning_rate": 2.451e-06, + "loss": 0.0543, + "num_tokens": 251269.0, + "reward": 4.125, + "reward_std": 2.75, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 2.75, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 15.166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3129059672355652, + "kl": 0.030778750777244568, + "learning_rate": 2.4539999999999997e-06, + "loss": 0.0015, + "num_tokens": 251481.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 15.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 20.031448364257812, + "kl": 0.008709351997822523, + "learning_rate": 2.457e-06, + "loss": -0.2137, + "num_tokens": 251697.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 15.203703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34016168117523193, + "kl": 0.013878948986530304, + "learning_rate": 2.4599999999999997e-06, + "loss": 0.0007, + "num_tokens": 251913.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 15.222222222222221, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0805835723876953, + "kl": 0.025030162185430527, + "learning_rate": 2.463e-06, + "loss": 0.011, + "num_tokens": 252278.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 822 + }, + { + "clip_ratio/high_max": 0.006849315017461777, + "clip_ratio/high_mean": 0.006849315017461777, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006849315017461777, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 15.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046453833580017, + "kl": 0.011136360466480255, + "learning_rate": 2.4659999999999998e-06, + "loss": 0.0495, + "num_tokens": 252691.0, + "reward": 0.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 0.375, + "rewards/reward_combined/std": 0.25, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 15.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005311328452080488, + "kl": 0.0026903748512268066, + "learning_rate": 2.469e-06, + "loss": 0.0001, + "num_tokens": 252927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 15.277777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12365634739398956, + "kl": 0.006539266090840101, + "learning_rate": 2.472e-06, + "loss": 0.0003, + "num_tokens": 253223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 15.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3430609703063965, + "kl": 0.01864812895655632, + "learning_rate": 2.475e-06, + "loss": -0.0173, + "num_tokens": 253498.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 15.314814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028615814517252147, + "kl": 8.106231689453125e-06, + "learning_rate": 2.478e-06, + "loss": 0.0, + "num_tokens": 253718.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.012500000186264515, + "clip_ratio/high_mean": 0.012500000186264515, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012500000186264515, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 15.333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.083314895629883, + "kl": 0.03541209362447262, + "learning_rate": 2.481e-06, + "loss": -0.0688, + "num_tokens": 254024.0, + "reward": 3.375, + "reward_std": 2.9545164108276367, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 2.9545164108276367, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.351851851851851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010798932984471321, + "kl": 0.01656962465494871, + "learning_rate": 2.484e-06, + "loss": 0.0008, + "num_tokens": 254308.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 88.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 32.333335876464844, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 15.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8000051975250244, + "kl": 0.012468930799514055, + "learning_rate": 2.487e-06, + "loss": 0.4189, + "num_tokens": 254885.0, + "reward": 1.6749999523162842, + "reward_std": 4.925021171569824, + "rewards/reward_combined/mean": 1.6749999523162842, + "rewards/reward_combined/std": 4.925021648406982, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 15.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04856877028942108, + "kl": 0.006245983298867941, + "learning_rate": 2.49e-06, + "loss": 0.0003, + "num_tokens": 255219.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 15.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021499263122677803, + "kl": 0.0004824884235858917, + "learning_rate": 2.493e-06, + "loss": 0.0, + "num_tokens": 255463.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 15.425925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002227855147793889, + "kl": 0.0009088899241760373, + "learning_rate": 2.496e-06, + "loss": 0.0, + "num_tokens": 255743.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.444444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1993919461965561, + "kl": 0.011588844936341047, + "learning_rate": 2.499e-06, + "loss": 0.0006, + "num_tokens": 256014.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 70.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 70.25, + "completions/mean_terminated_length": 70.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 15.462962962962964, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.380399227142334, + "kl": 0.015295200049877167, + "learning_rate": 2.502e-06, + "loss": -0.0819, + "num_tokens": 256515.0, + "reward": 2.875, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 15.481481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02020743303000927, + "kl": 0.0024401472182944417, + "learning_rate": 2.505e-06, + "loss": 0.0001, + "num_tokens": 256841.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.01785714365541935, + "clip_ratio/high_mean": 0.01785714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01785714365541935, + "completion_length": 75.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 75.25, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8669129610061646, + "kl": 0.018591200932860374, + "learning_rate": 2.508e-06, + "loss": 0.4382, + "num_tokens": 257366.0, + "reward": 5.0, + "reward_std": 6.0, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 6.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.980868339538574, + "kl": 0.06596332974731922, + "learning_rate": 2.5109999999999998e-06, + "loss": 0.1504, + "num_tokens": 257679.0, + "reward": 4.875, + "reward_std": 3.1983067989349365, + "rewards/reward_combined/mean": 4.875, + "rewards/reward_combined/std": 3.1983067989349365, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 15.537037037037036, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.717414855957031, + "kl": 0.049179114401340485, + "learning_rate": 2.514e-06, + "loss": -0.0306, + "num_tokens": 258011.0, + "reward": 4.125, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 2.25, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 15.555555555555555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025592342019081116, + "kl": 0.0010536994668655097, + "learning_rate": 2.5169999999999998e-06, + "loss": 0.0001, + "num_tokens": 258291.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 15.574074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1575191020965576, + "kl": 0.02428480051457882, + "learning_rate": 2.52e-06, + "loss": -0.0155, + "num_tokens": 258654.0, + "reward": 3.0, + "reward_std": 3.188521146774292, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.188521146774292, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 77.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 77.75, + "completions/mean_terminated_length": 18.33333396911621, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 15.592592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.552376747131348, + "kl": 0.13188587129116058, + "learning_rate": 2.523e-06, + "loss": 0.421, + "num_tokens": 259205.0, + "reward": 4.175000190734863, + "reward_std": 4.418427467346191, + "rewards/reward_combined/mean": 4.175000190734863, + "rewards/reward_combined/std": 4.418426990509033, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011714904569089413, + "kl": 0.0006037205748725682, + "learning_rate": 2.526e-06, + "loss": 0.0, + "num_tokens": 259465.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03280490264296532, + "kl": 0.0005672499537467957, + "learning_rate": 2.529e-06, + "loss": 0.0, + "num_tokens": 259721.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 75.0, + "completions/mean_terminated_length": 14.666666984558105, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 15.648148148148149, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0065717697143555, + "kl": 0.016092784702777863, + "learning_rate": 2.532e-06, + "loss": 0.4498, + "num_tokens": 260253.0, + "reward": 5.050000190734863, + "reward_std": 4.900000095367432, + "rewards/reward_combined/mean": 5.050000190734863, + "rewards/reward_combined/std": 4.900000095367432, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.742649555206299, + "kl": 0.030048758257180452, + "learning_rate": 2.535e-06, + "loss": 0.0033, + "num_tokens": 260551.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.685185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.368227005004883, + "kl": 0.023549416102468967, + "learning_rate": 2.538e-06, + "loss": -0.2378, + "num_tokens": 260842.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 15.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.097448348999023, + "kl": 0.03640543203800917, + "learning_rate": 2.541e-06, + "loss": -0.0501, + "num_tokens": 261193.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 15.722222222222221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1348850280046463, + "kl": 0.014819524250924587, + "learning_rate": 2.544e-06, + "loss": 0.0007, + "num_tokens": 261461.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 15.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.457876205444336, + "kl": 0.10042929649353027, + "learning_rate": 2.547e-06, + "loss": 0.0602, + "num_tokens": 261674.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18141864240169525, + "kl": 0.009819218306802213, + "learning_rate": 2.55e-06, + "loss": 0.0005, + "num_tokens": 261909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 15.777777777777779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01111546903848648, + "kl": 0.0038869944401085377, + "learning_rate": 2.553e-06, + "loss": 0.0002, + "num_tokens": 262179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.796296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.720064640045166, + "kl": 0.005797249847091734, + "learning_rate": 2.556e-06, + "loss": 0.0021, + "num_tokens": 262469.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 15.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.499596118927002, + "kl": 0.019651985203381628, + "learning_rate": 2.559e-06, + "loss": 0.0151, + "num_tokens": 262782.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 15.833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.40037572383880615, + "kl": 0.017680106684565544, + "learning_rate": 2.562e-06, + "loss": 0.0009, + "num_tokens": 263106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 15.851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.041896343231201, + "kl": 0.018168576061725616, + "learning_rate": 2.565e-06, + "loss": -0.0974, + "num_tokens": 263418.0, + "reward": 5.5, + "reward_std": 2.886751174926758, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 2.886751413345337, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 15.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9160261154174805, + "kl": 0.036556024104356766, + "learning_rate": 2.568e-06, + "loss": 0.011, + "num_tokens": 263751.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 15.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.762621283531189, + "kl": 0.018396658822894096, + "learning_rate": 2.571e-06, + "loss": 0.0008, + "num_tokens": 264043.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.907407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05802302062511444, + "kl": 0.0029262282769195735, + "learning_rate": 2.574e-06, + "loss": 0.0002, + "num_tokens": 264305.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 15.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.820932149887085, + "kl": 0.007205143105238676, + "learning_rate": 2.577e-06, + "loss": -0.0397, + "num_tokens": 264587.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 15.944444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008702805265784264, + "kl": 0.0010077431797981262, + "learning_rate": 2.58e-06, + "loss": 0.0001, + "num_tokens": 264795.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.962962962962964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11642981320619583, + "kl": 0.0075371162965893745, + "learning_rate": 2.583e-06, + "loss": 0.0004, + "num_tokens": 265065.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 57.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 57.25, + "completions/mean_terminated_length": 57.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.981481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.389289915561676, + "kl": 0.026683930307626724, + "learning_rate": 2.586e-06, + "loss": 0.0015, + "num_tokens": 265514.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 16.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006408870220184326, + "kl": 9.412899089511484e-05, + "learning_rate": 2.589e-06, + "loss": 0.0, + "num_tokens": 265821.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 16.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14997774362564087, + "kl": 0.02597730467095971, + "learning_rate": 2.592e-06, + "loss": 0.0013, + "num_tokens": 266103.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 16.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08316528052091599, + "kl": 0.009424427058547735, + "learning_rate": 2.595e-06, + "loss": 0.0005, + "num_tokens": 266403.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 16.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11386890709400177, + "kl": 0.01250389963388443, + "learning_rate": 2.598e-06, + "loss": 0.0006, + "num_tokens": 266639.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 16.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8794735670089722, + "kl": 0.1778415720909834, + "learning_rate": 2.601e-06, + "loss": 0.0097, + "num_tokens": 266926.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 16.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02523677609860897, + "kl": 0.0018860953277908266, + "learning_rate": 2.604e-06, + "loss": 0.0001, + "num_tokens": 267161.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04206731542944908, + "kl": 0.002634609234519303, + "learning_rate": 2.607e-06, + "loss": 0.0001, + "num_tokens": 267451.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.223171234130859, + "kl": 0.01811151672154665, + "learning_rate": 2.61e-06, + "loss": 0.0318, + "num_tokens": 267723.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 16.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08685275167226791, + "kl": 0.0033728512935340405, + "learning_rate": 2.6130000000000002e-06, + "loss": 0.0002, + "num_tokens": 267966.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 16.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013421605341136456, + "kl": 0.0006301686516962945, + "learning_rate": 2.616e-06, + "loss": 0.0, + "num_tokens": 268280.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017196089029312134, + "kl": 0.0010637480881996453, + "learning_rate": 2.6190000000000003e-06, + "loss": 0.0001, + "num_tokens": 268560.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.014285714365541935, + "clip_ratio/high_mean": 0.014285714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014285714365541935, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 16.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5222344398498535, + "kl": 0.08057743683457375, + "learning_rate": 2.622e-06, + "loss": 0.257, + "num_tokens": 268901.0, + "reward": 6.375, + "reward_std": 2.136000871658325, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.136000871658325, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 16.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050898998975753784, + "kl": 0.01073373481631279, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0005, + "num_tokens": 269233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 16.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00040665382402949035, + "kl": 2.074986696243286e-05, + "learning_rate": 2.628e-06, + "loss": 0.0, + "num_tokens": 269453.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 16.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1193506717681885, + "kl": 0.027223428711295128, + "learning_rate": 2.631e-06, + "loss": 0.0201, + "num_tokens": 269794.0, + "reward": 2.5, + "reward_std": 3.34165620803833, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.34165620803833, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 16.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.142059803009033, + "kl": 0.09255321323871613, + "learning_rate": 2.634e-06, + "loss": 0.233, + "num_tokens": 270165.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 16.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2199128121137619, + "kl": 0.006227896548807621, + "learning_rate": 2.637e-06, + "loss": 0.0005, + "num_tokens": 270392.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 16.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009426550008356571, + "kl": 0.0012574732536450028, + "learning_rate": 2.64e-06, + "loss": 0.0001, + "num_tokens": 270612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.01666666753590107, + "clip_ratio/low_min": 0.01666666753590107, + "clip_ratio/region_mean": 0.01666666753590107, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 16.333333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.762788772583008, + "kl": 0.059044482884928584, + "learning_rate": 2.643e-06, + "loss": 0.0679, + "num_tokens": 270888.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007317017298191786, + "kl": 0.0015296489582397044, + "learning_rate": 2.646e-06, + "loss": 0.0001, + "num_tokens": 271148.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 16.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006072989199310541, + "kl": 8.368250928469934e-05, + "learning_rate": 2.649e-06, + "loss": 0.0, + "num_tokens": 271455.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 16.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.729745864868164, + "kl": 0.04546512849628925, + "learning_rate": 2.652e-06, + "loss": 0.1228, + "num_tokens": 271787.0, + "reward": 2.0, + "reward_std": 2.4494898319244385, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 2.4494898319244385, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 16.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02424285188317299, + "kl": 0.0010086670517921448, + "learning_rate": 2.655e-06, + "loss": 0.0001, + "num_tokens": 271999.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 16.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06065645441412926, + "kl": 0.03292999789118767, + "learning_rate": 2.6580000000000002e-06, + "loss": 0.0016, + "num_tokens": 272299.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 16.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10115627944469452, + "kl": 0.018276115879416466, + "learning_rate": 2.661e-06, + "loss": 0.0009, + "num_tokens": 272665.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 16.462962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.175871849060059, + "kl": 0.013722633011639118, + "learning_rate": 2.6640000000000002e-06, + "loss": -0.0609, + "num_tokens": 272955.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08490806072950363, + "kl": 0.0038473325548693538, + "learning_rate": 2.667e-06, + "loss": 0.0002, + "num_tokens": 273227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 16.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08408478647470474, + "kl": 0.014603359624743462, + "learning_rate": 2.6700000000000003e-06, + "loss": 0.0007, + "num_tokens": 273559.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 16.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03706580400466919, + "kl": 0.003242477774620056, + "learning_rate": 2.673e-06, + "loss": 0.0002, + "num_tokens": 273771.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 16.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11515523493289948, + "kl": 0.013682656921446323, + "learning_rate": 2.6760000000000003e-06, + "loss": 0.0007, + "num_tokens": 274053.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0946512445807457, + "kl": 0.0027261764043942094, + "learning_rate": 2.679e-06, + "loss": 0.0001, + "num_tokens": 274309.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.735785484313965, + "kl": 0.17699826508760452, + "learning_rate": 2.6820000000000003e-06, + "loss": 0.0939, + "num_tokens": 274586.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027571633458137512, + "kl": 0.0007305496255867183, + "learning_rate": 2.685e-06, + "loss": 0.0, + "num_tokens": 274906.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 16.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.860959529876709, + "kl": 0.1105173472315073, + "learning_rate": 2.688e-06, + "loss": 0.0969, + "num_tokens": 275218.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 16.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04810880124568939, + "kl": 0.001539662480354309, + "learning_rate": 2.691e-06, + "loss": 0.0001, + "num_tokens": 275428.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662363812327385, + "kl": 0.0009340256219729781, + "learning_rate": 2.694e-06, + "loss": 0.0, + "num_tokens": 275688.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 16.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4957729578018188, + "kl": 0.10789931565523148, + "learning_rate": 2.697e-06, + "loss": 0.0042, + "num_tokens": 276052.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 16.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046485308557748795, + "kl": 0.0034694699570536613, + "learning_rate": 2.7e-06, + "loss": 0.0002, + "num_tokens": 276324.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 16.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00432240916416049, + "kl": 0.016069352626800537, + "learning_rate": 2.703e-06, + "loss": 0.0008, + "num_tokens": 276584.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 16.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06279173493385315, + "kl": 0.0014365874230861664, + "learning_rate": 2.706e-06, + "loss": 0.0001, + "num_tokens": 276844.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 16.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03251232951879501, + "kl": 0.019935129210352898, + "learning_rate": 2.7090000000000002e-06, + "loss": 0.001, + "num_tokens": 277140.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007462686393409967, + "clip_ratio/low_min": 0.007462686393409967, + "clip_ratio/region_mean": 0.007462686393409967, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 16.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.653429985046387, + "kl": 0.047437798231840134, + "learning_rate": 2.712e-06, + "loss": 0.1554, + "num_tokens": 277493.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 16.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010527708567678928, + "kl": 0.0014890655875205994, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.0001, + "num_tokens": 277805.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 16.796296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.460294008255005, + "kl": 0.0361027829349041, + "learning_rate": 2.718e-06, + "loss": -0.1019, + "num_tokens": 278240.0, + "reward": 2.174999952316284, + "reward_std": 1.1786290407180786, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.1786291599273682, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 16.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0483318492770195, + "kl": 0.00674024224281311, + "learning_rate": 2.7210000000000003e-06, + "loss": 0.0003, + "num_tokens": 278508.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 16.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05405644699931145, + "kl": 0.0017870822339318693, + "learning_rate": 2.724e-06, + "loss": 0.0001, + "num_tokens": 278824.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 16.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.086075782775879, + "kl": 0.009135594591498375, + "learning_rate": 2.7270000000000003e-06, + "loss": 0.1567, + "num_tokens": 279173.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 69.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 69.75, + "completions/mean_terminated_length": 69.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 16.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.284515142440796, + "kl": 0.03399188816547394, + "learning_rate": 2.73e-06, + "loss": 0.4709, + "num_tokens": 279672.0, + "reward": 3.625, + "reward_std": 5.202163219451904, + "rewards/reward_combined/mean": 3.625, + "rewards/reward_combined/std": 5.202163219451904, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 16.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07876795530319214, + "kl": 0.0020766069064848125, + "learning_rate": 2.7330000000000003e-06, + "loss": 0.0001, + "num_tokens": 279968.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 16.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.349666595458984, + "kl": 0.06186091527342796, + "learning_rate": 2.736e-06, + "loss": -0.001, + "num_tokens": 280296.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14858929812908173, + "kl": 0.010949058923870325, + "learning_rate": 2.7390000000000004e-06, + "loss": 0.0005, + "num_tokens": 280566.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 16.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01540637481957674, + "kl": 0.004458198556676507, + "learning_rate": 2.742e-06, + "loss": 0.0002, + "num_tokens": 280836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.211705684661865, + "kl": 0.02147698076441884, + "learning_rate": 2.745e-06, + "loss": 0.0209, + "num_tokens": 281114.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 16.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.836970090866089, + "kl": 0.02658749930560589, + "learning_rate": 2.748e-06, + "loss": 0.0343, + "num_tokens": 281433.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.061885833740234, + "kl": 0.21568666584789753, + "learning_rate": 2.751e-06, + "loss": 0.0096, + "num_tokens": 281738.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024867020547389984, + "kl": 0.0009969845414161682, + "learning_rate": 2.7540000000000002e-06, + "loss": 0.0, + "num_tokens": 281950.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 17.037037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.72955322265625, + "kl": 0.014656886691227555, + "learning_rate": 2.757e-06, + "loss": 0.0055, + "num_tokens": 282220.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 17.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009860399179160595, + "kl": 0.0010821044561453164, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.0001, + "num_tokens": 282440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 17.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20747016370296478, + "kl": 0.014619201421737671, + "learning_rate": 2.763e-06, + "loss": 0.0007, + "num_tokens": 282652.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 17.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4475628137588501, + "kl": 0.05984331271611154, + "learning_rate": 2.7660000000000003e-06, + "loss": 0.0023, + "num_tokens": 282974.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 17.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.306441783905029, + "kl": 0.03935196250677109, + "learning_rate": 2.769e-06, + "loss": 0.0715, + "num_tokens": 283339.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 17.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.366795063018799, + "kl": 0.0751175731420517, + "learning_rate": 2.7720000000000003e-06, + "loss": -0.0527, + "num_tokens": 283680.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 17.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02690128982067108, + "kl": 0.0009666383266448975, + "learning_rate": 2.775e-06, + "loss": 0.0, + "num_tokens": 283960.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04740017652511597, + "kl": 0.0076216175220906734, + "learning_rate": 2.7780000000000003e-06, + "loss": 0.0004, + "num_tokens": 284289.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 17.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004154135473072529, + "kl": 0.016127178445458412, + "learning_rate": 2.781e-06, + "loss": 0.0008, + "num_tokens": 284549.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 17.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09007273614406586, + "kl": 0.017842161934822798, + "learning_rate": 2.7840000000000004e-06, + "loss": 0.0009, + "num_tokens": 284882.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6361002922058105, + "kl": 0.09756945073604584, + "learning_rate": 2.787e-06, + "loss": 0.0767, + "num_tokens": 285235.0, + "reward": 3.875, + "reward_std": 2.9545164108276367, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 2.9545164108276367, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 17.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008572395890951157, + "kl": 0.0013220729306340218, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.0001, + "num_tokens": 285547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.750455856323242, + "kl": 0.12378092110157013, + "learning_rate": 2.793e-06, + "loss": 0.1037, + "num_tokens": 285854.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 17.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17185324430465698, + "kl": 0.02490962017327547, + "learning_rate": 2.7960000000000004e-06, + "loss": 0.0012, + "num_tokens": 286147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.1884236335754395, + "kl": 0.002807863100315444, + "learning_rate": 2.7990000000000002e-06, + "loss": -0.0004, + "num_tokens": 286403.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06539967656135559, + "kl": 0.005298139003571123, + "learning_rate": 2.802e-06, + "loss": 0.0003, + "num_tokens": 286733.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.333333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5376152992248535, + "kl": 0.03906891401857138, + "learning_rate": 2.8050000000000002e-06, + "loss": 0.015, + "num_tokens": 287027.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 17.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018045363947749138, + "kl": 0.005035794340074062, + "learning_rate": 2.808e-06, + "loss": 0.0003, + "num_tokens": 287315.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 17.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0250736474990845, + "kl": 0.07505254819989204, + "learning_rate": 2.8110000000000003e-06, + "loss": 0.0038, + "num_tokens": 287578.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 17.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020768439397215843, + "kl": 0.020604564808309078, + "learning_rate": 2.814e-06, + "loss": 0.001, + "num_tokens": 287870.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 17.40740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.393409252166748, + "kl": 0.0839981846511364, + "learning_rate": 2.817e-06, + "loss": 0.1574, + "num_tokens": 288236.0, + "reward": 1.875, + "reward_std": 1.314977765083313, + "rewards/reward_combined/mean": 1.875, + "rewards/reward_combined/std": 1.3149778842926025, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 17.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18137066066265106, + "kl": 0.00959052995312959, + "learning_rate": 2.82e-06, + "loss": 0.0005, + "num_tokens": 288497.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 17.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05492454767227173, + "kl": 0.009323860984295607, + "learning_rate": 2.823e-06, + "loss": 0.0005, + "num_tokens": 288783.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0024271844886243343, + "clip_ratio/high_mean": 0.0024271844886243343, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024271844886243343, + "completion_length": 66.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 66.25, + "completions/mean_terminated_length": 66.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 17.462962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7071022987365723, + "kl": 0.06320535112172365, + "learning_rate": 2.8259999999999997e-06, + "loss": 0.0383, + "num_tokens": 289264.0, + "reward": 2.0, + "reward_std": 1.471960186958313, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.471960186958313, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 17.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.644867420196533, + "kl": 0.04936415143311024, + "learning_rate": 2.829e-06, + "loss": 0.0008, + "num_tokens": 289567.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 17.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2899612188339233, + "kl": 0.02315691113471985, + "learning_rate": 2.8319999999999997e-06, + "loss": -0.0807, + "num_tokens": 289979.0, + "reward": 2.174999952316284, + "reward_std": 1.1786291599273682, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.1786291599273682, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 17.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6830765008926392, + "kl": 0.06120620295405388, + "learning_rate": 2.835e-06, + "loss": 0.0034, + "num_tokens": 290261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 17.537037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.678679466247559, + "kl": 0.030490998411551118, + "learning_rate": 2.8379999999999998e-06, + "loss": 0.099, + "num_tokens": 290537.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 17.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06989409774541855, + "kl": 0.0022222733023227192, + "learning_rate": 2.841e-06, + "loss": 0.0001, + "num_tokens": 290772.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09559326618909836, + "kl": 0.0045561735751107335, + "learning_rate": 2.844e-06, + "loss": 0.0002, + "num_tokens": 291042.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 17.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11788101494312286, + "kl": 0.0019803866744041443, + "learning_rate": 2.847e-06, + "loss": 0.0001, + "num_tokens": 291248.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.919772148132324, + "kl": 0.005017139395931736, + "learning_rate": 2.85e-06, + "loss": 0.1366, + "num_tokens": 291518.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 17.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02758970484137535, + "kl": 0.0008264200441772118, + "learning_rate": 2.853e-06, + "loss": 0.0, + "num_tokens": 291824.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 17.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027528919745236635, + "kl": 0.0007725166215095669, + "learning_rate": 2.856e-06, + "loss": 0.0, + "num_tokens": 292104.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 17.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015700655058026314, + "kl": 0.00035790354013442993, + "learning_rate": 2.859e-06, + "loss": 0.0, + "num_tokens": 292348.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 17.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4215909242630005, + "kl": 0.05775163508951664, + "learning_rate": 2.862e-06, + "loss": 0.0031, + "num_tokens": 292670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 17.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.383664131164551, + "kl": 0.04355753492563963, + "learning_rate": 2.865e-06, + "loss": -0.1513, + "num_tokens": 293015.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 17.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007158689200878143, + "kl": 0.00010698745609261096, + "learning_rate": 2.868e-06, + "loss": 0.0, + "num_tokens": 293336.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 17.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10804415494203568, + "kl": 0.008159431832609698, + "learning_rate": 2.871e-06, + "loss": 0.0004, + "num_tokens": 293645.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 17.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1420525163412094, + "kl": 0.005328859901055694, + "learning_rate": 2.874e-06, + "loss": 0.0003, + "num_tokens": 293941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 17.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02422979660332203, + "kl": 0.09467829018831253, + "learning_rate": 2.877e-06, + "loss": 0.0047, + "num_tokens": 294307.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00691164331510663, + "kl": 0.0017841786611825228, + "learning_rate": 2.88e-06, + "loss": 0.0001, + "num_tokens": 294567.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 17.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05318329483270645, + "kl": 0.03480059280991554, + "learning_rate": 2.883e-06, + "loss": 0.0017, + "num_tokens": 294867.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004444839432835579, + "kl": 4.042685031890869e-05, + "learning_rate": 2.886e-06, + "loss": 0.0, + "num_tokens": 295087.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02944686822593212, + "kl": 0.0016392802353948355, + "learning_rate": 2.8889999999999998e-06, + "loss": 0.0001, + "num_tokens": 295314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 17.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5267839431762695, + "kl": 0.09300582110881805, + "learning_rate": 2.892e-06, + "loss": -0.0358, + "num_tokens": 295622.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.580138206481934, + "kl": 0.023242179304361343, + "learning_rate": 2.895e-06, + "loss": 0.1713, + "num_tokens": 295862.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 17.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.606494903564453, + "kl": 0.05697191320359707, + "learning_rate": 2.898e-06, + "loss": 0.0362, + "num_tokens": 296148.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 17.925925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.292166709899902, + "kl": 0.02170123066753149, + "learning_rate": 2.901e-06, + "loss": 0.0653, + "num_tokens": 296433.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 17.944444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.754703521728516, + "kl": 0.013701246120035648, + "learning_rate": 2.904e-06, + "loss": 0.0002, + "num_tokens": 296778.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.046321868896484, + "kl": 0.02073330502025783, + "learning_rate": 2.907e-06, + "loss": 0.0146, + "num_tokens": 297048.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 17.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040881071239709854, + "kl": 0.018811689253197983, + "learning_rate": 2.91e-06, + "loss": 0.0009, + "num_tokens": 297336.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 18.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15847693383693695, + "kl": 0.003819052129983902, + "learning_rate": 2.913e-06, + "loss": 0.0002, + "num_tokens": 297596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 18.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04494069516658783, + "kl": 0.0062313086818903685, + "learning_rate": 2.916e-06, + "loss": 0.0003, + "num_tokens": 297854.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 18.037037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.149745464324951, + "kl": 0.014793979469686747, + "learning_rate": 2.919e-06, + "loss": 0.0001, + "num_tokens": 298285.0, + "reward": 1.8499999046325684, + "reward_std": 0.9110432863235474, + "rewards/reward_combined/mean": 1.8499999046325684, + "rewards/reward_combined/std": 0.9110434055328369, + "step": 974 + }, + { + "clip_ratio/high_max": 0.017241379246115685, + "clip_ratio/high_mean": 0.017241379246115685, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017241379246115685, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.055555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.219341278076172, + "kl": 0.041466670110821724, + "learning_rate": 2.922e-06, + "loss": 0.0819, + "num_tokens": 298566.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1388024240732193, + "kl": 0.010825720615684986, + "learning_rate": 2.925e-06, + "loss": 0.0005, + "num_tokens": 298822.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 18.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036770425736904144, + "kl": 0.00320415198802948, + "learning_rate": 2.928e-06, + "loss": 0.0002, + "num_tokens": 299034.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 18.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16454218327999115, + "kl": 0.026634372770786285, + "learning_rate": 2.931e-06, + "loss": 0.0013, + "num_tokens": 299356.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 18.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029778636526316404, + "kl": 0.016340223141014576, + "learning_rate": 2.934e-06, + "loss": 0.0008, + "num_tokens": 299616.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 18.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10779254138469696, + "kl": 0.0024613887071609497, + "learning_rate": 2.937e-06, + "loss": 0.0001, + "num_tokens": 299824.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 18.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044605552102439106, + "kl": 4.427880048751831e-05, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.0, + "num_tokens": 300044.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.185185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.850919246673584, + "kl": 0.014800236793234944, + "learning_rate": 2.943e-06, + "loss": 0.0149, + "num_tokens": 300322.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 18.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.76835036277771, + "kl": 0.03857973590493202, + "learning_rate": 2.946e-06, + "loss": 0.02, + "num_tokens": 300635.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 18.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.330134868621826, + "kl": 0.0038978730226517655, + "learning_rate": 2.949e-06, + "loss": 0.0399, + "num_tokens": 300957.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.354736804962158, + "kl": 0.01612484361976385, + "learning_rate": 2.952e-06, + "loss": -0.0576, + "num_tokens": 301224.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 18.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8794246912002563, + "kl": 0.13801612704992294, + "learning_rate": 2.955e-06, + "loss": 0.0071, + "num_tokens": 301526.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 18.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020747825037688017, + "kl": 0.00010592797480057925, + "learning_rate": 2.958e-06, + "loss": 0.0, + "num_tokens": 301834.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 18.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6713685393333435, + "kl": 0.1279423087835312, + "learning_rate": 2.961e-06, + "loss": 0.0061, + "num_tokens": 302168.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.314814814814813, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.612220287322998, + "kl": 0.10975046455860138, + "learning_rate": 2.964e-06, + "loss": -0.0253, + "num_tokens": 302497.0, + "reward": 3.75, + "reward_std": 2.872281312942505, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.872281312942505, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0224091075360775, + "kl": 0.009532616473734379, + "learning_rate": 2.967e-06, + "loss": 0.0005, + "num_tokens": 302779.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03791513666510582, + "kl": 0.013638301170431077, + "learning_rate": 2.97e-06, + "loss": 0.0008, + "num_tokens": 303066.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 18.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07310573011636734, + "kl": 0.01153097813948989, + "learning_rate": 2.973e-06, + "loss": 0.0006, + "num_tokens": 303409.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.013513513840734959, + "clip_ratio/low_min": 0.013513513840734959, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 18.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.885204792022705, + "kl": 0.2055332437157631, + "learning_rate": 2.976e-06, + "loss": -0.0392, + "num_tokens": 303708.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 18.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1347719132900238, + "kl": 0.005800263257697225, + "learning_rate": 2.979e-06, + "loss": 0.0003, + "num_tokens": 303942.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 18.425925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.487607002258301, + "kl": 0.013947858475148678, + "learning_rate": 2.982e-06, + "loss": -0.1081, + "num_tokens": 304266.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 18.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02589021995663643, + "kl": 0.0011689886450767517, + "learning_rate": 2.9850000000000002e-06, + "loss": 0.0001, + "num_tokens": 304478.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15449932217597961, + "kl": 0.025850625708699226, + "learning_rate": 2.988e-06, + "loss": 0.0013, + "num_tokens": 304777.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 18.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11922191828489304, + "kl": 0.014185238629579544, + "learning_rate": 2.9910000000000002e-06, + "loss": 0.0007, + "num_tokens": 305116.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 18.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.687262058258057, + "kl": 0.07127954624593258, + "learning_rate": 2.994e-06, + "loss": -0.0903, + "num_tokens": 305485.0, + "reward": 1.875, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 1.875, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 18.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7668983936309814, + "kl": 0.2996699586510658, + "learning_rate": 2.9970000000000003e-06, + "loss": 0.0164, + "num_tokens": 305804.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010628429241478443, + "kl": 0.004377992358058691, + "learning_rate": 3e-06, + "loss": 0.0002, + "num_tokens": 306074.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 18.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9562952518463135, + "kl": 0.006490831729024649, + "learning_rate": 2.999666666666667e-06, + "loss": -0.0002, + "num_tokens": 306362.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 18.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20808644592761993, + "kl": 0.01005901675671339, + "learning_rate": 2.9993333333333332e-06, + "loss": 0.0006, + "num_tokens": 306630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14580221474170685, + "kl": 0.008147581713274121, + "learning_rate": 2.999e-06, + "loss": 0.0004, + "num_tokens": 306906.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13075172901153564, + "kl": 0.06699612364172935, + "learning_rate": 2.9986666666666668e-06, + "loss": 0.0033, + "num_tokens": 307174.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05798104405403137, + "kl": 0.0031279143877327442, + "learning_rate": 2.9983333333333336e-06, + "loss": 0.0002, + "num_tokens": 307436.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04234755411744118, + "kl": 0.0017250796081498265, + "learning_rate": 2.998e-06, + "loss": 0.0001, + "num_tokens": 307706.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 18.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02651769295334816, + "kl": 0.020571790635585785, + "learning_rate": 2.9976666666666667e-06, + "loss": 0.001, + "num_tokens": 308000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 18.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028096335008740425, + "kl": 0.09387857094407082, + "learning_rate": 2.997333333333333e-06, + "loss": 0.0047, + "num_tokens": 308364.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.006849315017461777, + "clip_ratio/high_mean": 0.006849315017461777, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006849315017461777, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 18.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.333775043487549, + "kl": 0.03270596917718649, + "learning_rate": 2.9970000000000003e-06, + "loss": 0.0563, + "num_tokens": 308733.0, + "reward": 5.875, + "reward_std": 3.5910768508911133, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.5910770893096924, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 18.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015856772661209106, + "kl": 0.0004476197063922882, + "learning_rate": 2.996666666666667e-06, + "loss": 0.0, + "num_tokens": 308993.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2930438816547394, + "kl": 0.033293591812253, + "learning_rate": 2.9963333333333334e-06, + "loss": 0.0017, + "num_tokens": 309282.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 18.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.213979721069336, + "kl": 0.10456418618559837, + "learning_rate": 2.996e-06, + "loss": -0.0101, + "num_tokens": 309591.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 18.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.028774261474609, + "kl": 0.009066569808055647, + "learning_rate": 2.9956666666666666e-06, + "loss": -0.0029, + "num_tokens": 309851.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 18.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1365141123533249, + "kl": 0.013530088821426034, + "learning_rate": 2.9953333333333333e-06, + "loss": 0.0006, + "num_tokens": 310162.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 18.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19325755536556244, + "kl": 0.05480775982141495, + "learning_rate": 2.995e-06, + "loss": 0.0027, + "num_tokens": 310520.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11743105947971344, + "kl": 0.017631690949201584, + "learning_rate": 2.994666666666667e-06, + "loss": 0.0009, + "num_tokens": 310816.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 18.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04771581292152405, + "kl": 0.01937988307327032, + "learning_rate": 2.9943333333333333e-06, + "loss": 0.001, + "num_tokens": 311150.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 18.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.864377021789551, + "kl": 0.03459107130765915, + "learning_rate": 2.994e-06, + "loss": 0.109, + "num_tokens": 311462.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010085365734994411, + "kl": 0.0010973572498187423, + "learning_rate": 2.993666666666667e-06, + "loss": 0.0001, + "num_tokens": 311682.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 18.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054485615342855453, + "kl": 0.008097039069980383, + "learning_rate": 2.993333333333333e-06, + "loss": 0.0004, + "num_tokens": 311956.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 18.925925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.920691013336182, + "kl": 0.10139943659305573, + "learning_rate": 2.993e-06, + "loss": 0.0205, + "num_tokens": 312241.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 18.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0073060947470366955, + "kl": 0.0007110536098480225, + "learning_rate": 2.9926666666666668e-06, + "loss": 0.0, + "num_tokens": 312485.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.014705882407724857, + "clip_ratio/high_mean": 0.014705882407724857, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014705882407724857, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 18.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.789229154586792, + "kl": 0.05072168447077274, + "learning_rate": 2.9923333333333335e-06, + "loss": -0.0561, + "num_tokens": 312790.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 18.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028168451972305775, + "kl": 0.003279261291027069, + "learning_rate": 2.992e-06, + "loss": 0.0002, + "num_tokens": 313026.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 19.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061545904725790024, + "kl": 0.0036889008479192853, + "learning_rate": 2.9916666666666667e-06, + "loss": 0.0002, + "num_tokens": 313360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 19.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04298610985279083, + "kl": 0.0021496829576790333, + "learning_rate": 2.9913333333333335e-06, + "loss": 0.0001, + "num_tokens": 313622.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2620873749256134, + "kl": 0.036713266745209694, + "learning_rate": 2.9910000000000002e-06, + "loss": 0.0019, + "num_tokens": 313923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 19.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16502098739147186, + "kl": 0.05797074735164642, + "learning_rate": 2.990666666666667e-06, + "loss": 0.0029, + "num_tokens": 314191.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 19.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07259169965982437, + "kl": 0.01000349223613739, + "learning_rate": 2.9903333333333334e-06, + "loss": 0.0005, + "num_tokens": 314503.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.545125424861908, + "kl": 0.05854676757007837, + "learning_rate": 2.99e-06, + "loss": 0.0028, + "num_tokens": 314772.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 19.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6622626781463623, + "kl": 0.09639779478311539, + "learning_rate": 2.9896666666666665e-06, + "loss": -0.0059, + "num_tokens": 315139.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 19.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6983280777931213, + "kl": 0.09250823222100735, + "learning_rate": 2.9893333333333333e-06, + "loss": 0.0045, + "num_tokens": 315430.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 19.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059819430112838745, + "kl": 0.008328840602189302, + "learning_rate": 2.989e-06, + "loss": 0.0004, + "num_tokens": 315776.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.860781192779541, + "kl": 0.020166019443422556, + "learning_rate": 2.988666666666667e-06, + "loss": 0.4523, + "num_tokens": 316291.0, + "reward": 4.875, + "reward_std": 5.25, + "rewards/reward_combined/mean": 4.875, + "rewards/reward_combined/std": 5.25, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 19.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05397823452949524, + "kl": 0.005703333066776395, + "learning_rate": 2.9883333333333332e-06, + "loss": 0.0003, + "num_tokens": 316563.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 19.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9393424987792969, + "kl": 0.17014753073453903, + "learning_rate": 2.988e-06, + "loss": -0.0586, + "num_tokens": 316926.0, + "reward": 4.25, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2093377113342285, + "kl": 0.016108008101582527, + "learning_rate": 2.987666666666667e-06, + "loss": 0.0019, + "num_tokens": 317255.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 19.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03911317139863968, + "kl": 0.0030389464809559286, + "learning_rate": 2.987333333333333e-06, + "loss": 0.0002, + "num_tokens": 317569.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00023904952104203403, + "kl": 0.00010310113430023193, + "learning_rate": 2.9870000000000004e-06, + "loss": 0.0, + "num_tokens": 317789.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1512259840965271, + "kl": 0.003034040331840515, + "learning_rate": 2.9866666666666667e-06, + "loss": 0.0002, + "num_tokens": 318001.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 19.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11819654703140259, + "kl": 0.00561297032982111, + "learning_rate": 2.9863333333333335e-06, + "loss": 0.0003, + "num_tokens": 318244.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 19.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08559706807136536, + "kl": 0.004623609886039048, + "learning_rate": 2.986e-06, + "loss": 0.0002, + "num_tokens": 318514.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002773053478449583, + "kl": 0.0033064335584640503, + "learning_rate": 2.9856666666666667e-06, + "loss": 0.0002, + "num_tokens": 318750.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08593981713056564, + "kl": 0.023099279031157494, + "learning_rate": 2.9853333333333334e-06, + "loss": 0.0012, + "num_tokens": 319041.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5635592937469482, + "kl": 0.0808534175157547, + "learning_rate": 2.9850000000000002e-06, + "loss": 0.3652, + "num_tokens": 319421.0, + "reward": 1.75, + "reward_std": 3.947572946548462, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 3.947573184967041, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 19.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021194322034716606, + "kl": 0.0006624294037465006, + "learning_rate": 2.984666666666667e-06, + "loss": 0.0, + "num_tokens": 319742.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 19.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14266392588615417, + "kl": 0.044841449707746506, + "learning_rate": 2.9843333333333334e-06, + "loss": 0.0022, + "num_tokens": 320105.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 19.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00994281005114317, + "kl": 0.0006901979213580489, + "learning_rate": 2.984e-06, + "loss": 0.0, + "num_tokens": 320325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.01315789483487606, + "clip_ratio/high_mean": 0.01315789483487606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01315789483487606, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.721512317657471, + "kl": 0.15218839049339294, + "learning_rate": 2.9836666666666665e-06, + "loss": -0.0199, + "num_tokens": 320625.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 19.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004217702895402908, + "kl": 0.016147812828421593, + "learning_rate": 2.9833333333333333e-06, + "loss": 0.0008, + "num_tokens": 320885.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056700922548770905, + "kl": 0.001943156123161316, + "learning_rate": 2.983e-06, + "loss": 0.0001, + "num_tokens": 321104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 19.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.6867146492004395, + "kl": 0.007495361380279064, + "learning_rate": 2.982666666666667e-06, + "loss": 0.0176, + "num_tokens": 321393.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 19.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.446792602539062, + "kl": 0.0003892386448569596, + "learning_rate": 2.982333333333333e-06, + "loss": 0.0249, + "num_tokens": 321690.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 19.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05403664708137512, + "kl": 0.005810579285025597, + "learning_rate": 2.982e-06, + "loss": 0.0003, + "num_tokens": 322022.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 19.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018789371475577354, + "kl": 0.0008191429078578949, + "learning_rate": 2.9816666666666668e-06, + "loss": 0.0, + "num_tokens": 322282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.297018051147461, + "kl": 0.05140158161520958, + "learning_rate": 2.9813333333333336e-06, + "loss": -0.1298, + "num_tokens": 322513.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.59259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.219817638397217, + "kl": 0.3111524060368538, + "learning_rate": 2.9810000000000003e-06, + "loss": 0.0311, + "num_tokens": 322815.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 19.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014365162700414658, + "kl": 0.0004117531352676451, + "learning_rate": 2.9806666666666667e-06, + "loss": 0.0, + "num_tokens": 323095.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09150257706642151, + "kl": 0.14653567969799042, + "learning_rate": 2.9803333333333335e-06, + "loss": 0.0073, + "num_tokens": 323406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08486936241388321, + "kl": 0.009652079083025455, + "learning_rate": 2.98e-06, + "loss": 0.0005, + "num_tokens": 323695.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30423662066459656, + "kl": 0.024303349666297436, + "learning_rate": 2.9796666666666666e-06, + "loss": 0.0013, + "num_tokens": 323997.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 19.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06946351379156113, + "kl": 0.00913584278896451, + "learning_rate": 2.9793333333333334e-06, + "loss": 0.0005, + "num_tokens": 324271.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 19.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13158056139945984, + "kl": 0.0073317347560077906, + "learning_rate": 2.979e-06, + "loss": 0.0004, + "num_tokens": 324527.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.627601146697998, + "kl": 0.07835894823074341, + "learning_rate": 2.978666666666667e-06, + "loss": 0.1095, + "num_tokens": 324810.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 19.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.50330114364624, + "kl": 0.012882797745987773, + "learning_rate": 2.9783333333333333e-06, + "loss": 0.2183, + "num_tokens": 325086.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 81.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 81.75, + "completions/mean_terminated_length": 23.666667938232422, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 19.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9908783435821533, + "kl": 0.036355357617139816, + "learning_rate": 2.978e-06, + "loss": 0.4558, + "num_tokens": 325665.0, + "reward": 2.924999952316284, + "reward_std": 5.566791534423828, + "rewards/reward_combined/mean": 2.924999952316284, + "rewards/reward_combined/std": 5.566791534423828, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3228541612625122, + "kl": 0.10724344104528427, + "learning_rate": 2.9776666666666665e-06, + "loss": 0.0054, + "num_tokens": 325938.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021391578018665314, + "kl": 0.009398100432008505, + "learning_rate": 2.9773333333333333e-06, + "loss": 0.0005, + "num_tokens": 326222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 19.814814814814813, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.63429594039917, + "kl": 0.06384639628231525, + "learning_rate": 2.977e-06, + "loss": 0.22, + "num_tokens": 326628.0, + "reward": 2.375, + "reward_std": 1.314977765083313, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.3149778842926025, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 19.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021955263800919056, + "kl": 0.00012518552830442786, + "learning_rate": 2.976666666666667e-06, + "loss": 0.0, + "num_tokens": 326936.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 19.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8048208951950073, + "kl": 0.01688295044004917, + "learning_rate": 2.9763333333333336e-06, + "loss": -0.0015, + "num_tokens": 327349.0, + "reward": 1.625, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08759452402591705, + "kl": 0.006184890866279602, + "learning_rate": 2.976e-06, + "loss": 0.0003, + "num_tokens": 327557.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035650063306093216, + "kl": 0.002114921808242798, + "learning_rate": 2.9756666666666667e-06, + "loss": 0.0001, + "num_tokens": 327769.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.00909090880304575, + "clip_ratio/low_min": 0.00909090880304575, + "clip_ratio/region_mean": 0.00909090880304575, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 19.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1686177253723145, + "kl": 0.1576797552406788, + "learning_rate": 2.9753333333333335e-06, + "loss": -0.0046, + "num_tokens": 328102.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 19.925925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.954868793487549, + "kl": 0.01296089543029666, + "learning_rate": 2.9750000000000003e-06, + "loss": -0.0126, + "num_tokens": 328436.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 19.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020211064256727695, + "kl": 0.0009937775903381407, + "learning_rate": 2.9746666666666667e-06, + "loss": 0.0, + "num_tokens": 328716.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 19.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.463118553161621, + "kl": 0.08824754506349564, + "learning_rate": 2.9743333333333335e-06, + "loss": 0.072, + "num_tokens": 329051.0, + "reward": 3.75, + "reward_std": 2.872281312942505, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.872281312942505, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 19.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.703159809112549, + "kl": 0.013251371681690216, + "learning_rate": 2.974e-06, + "loss": -0.0167, + "num_tokens": 329383.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 20.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30451029539108276, + "kl": 0.02358182705938816, + "learning_rate": 2.9736666666666666e-06, + "loss": 0.0011, + "num_tokens": 329646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 20.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.164877414703369, + "kl": 0.08919530734419823, + "learning_rate": 2.9733333333333334e-06, + "loss": -0.041, + "num_tokens": 330003.0, + "reward": 2.375, + "reward_std": 1.314977765083313, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.3149778842926025, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09259263426065445, + "kl": 0.007950606057420373, + "learning_rate": 2.973e-06, + "loss": 0.0004, + "num_tokens": 330283.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 20.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010594749823212624, + "kl": 0.00932190753519535, + "learning_rate": 2.972666666666667e-06, + "loss": 0.0005, + "num_tokens": 330555.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.074074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.488523483276367, + "kl": 0.18745465204119682, + "learning_rate": 2.9723333333333333e-06, + "loss": 0.0489, + "num_tokens": 330862.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 20.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003421426983550191, + "kl": 0.01631779409945011, + "learning_rate": 2.972e-06, + "loss": 0.0008, + "num_tokens": 331122.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1423378735780716, + "kl": 0.013231783639639616, + "learning_rate": 2.9716666666666664e-06, + "loss": 0.0007, + "num_tokens": 331411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 20.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.477877140045166, + "kl": 0.1347007192671299, + "learning_rate": 2.9713333333333337e-06, + "loss": 0.0281, + "num_tokens": 331761.0, + "reward": 3.625, + "reward_std": 2.8975563049316406, + "rewards/reward_combined/mean": 3.625, + "rewards/reward_combined/std": 2.8975565433502197, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 20.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31940189003944397, + "kl": 0.02269493043422699, + "learning_rate": 2.971e-06, + "loss": 0.0011, + "num_tokens": 332017.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.857755184173584, + "kl": 0.02816795534454286, + "learning_rate": 2.970666666666667e-06, + "loss": 0.1688, + "num_tokens": 332298.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.185185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.73239278793335, + "kl": 0.10749359801411629, + "learning_rate": 2.9703333333333336e-06, + "loss": 0.011, + "num_tokens": 332624.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 20.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.484701156616211, + "kl": 0.03071892447769642, + "learning_rate": 2.97e-06, + "loss": 0.0626, + "num_tokens": 332978.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 20.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002247113035991788, + "kl": 0.0034552812576293945, + "learning_rate": 2.9696666666666667e-06, + "loss": 0.0002, + "num_tokens": 333214.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.007936508394777775, + "clip_ratio/high_mean": 0.007936508394777775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007936508394777775, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 20.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9538395404815674, + "kl": 0.08735976181924343, + "learning_rate": 2.9693333333333335e-06, + "loss": 0.1933, + "num_tokens": 333584.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 76.5, + "completions/mean_terminated_length": 16.666667938232422, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 20.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7457423210144043, + "kl": 0.19177453219890594, + "learning_rate": 2.9690000000000003e-06, + "loss": 0.444, + "num_tokens": 334130.0, + "reward": 5.050000190734863, + "reward_std": 5.900000095367432, + "rewards/reward_combined/mean": 5.050000190734863, + "rewards/reward_combined/std": 5.90000057220459, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 20.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.478559494018555, + "kl": 0.03354105446487665, + "learning_rate": 2.9686666666666666e-06, + "loss": -0.0083, + "num_tokens": 334433.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 20.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.359942436218262, + "kl": 0.01864597573876381, + "learning_rate": 2.9683333333333334e-06, + "loss": 0.2653, + "num_tokens": 334694.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5284212231636047, + "kl": 0.031490376219153404, + "learning_rate": 2.968e-06, + "loss": 0.0017, + "num_tokens": 335024.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002659357152879238, + "kl": 0.0008811780717223883, + "learning_rate": 2.9676666666666666e-06, + "loss": 0.0, + "num_tokens": 335304.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 20.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11540081351995468, + "kl": 0.004415854811668396, + "learning_rate": 2.9673333333333334e-06, + "loss": 0.0002, + "num_tokens": 335510.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 20.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010740313678979874, + "kl": 0.0005187153728911653, + "learning_rate": 2.967e-06, + "loss": 0.0, + "num_tokens": 335730.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.564188003540039, + "kl": 0.11788275837898254, + "learning_rate": 2.966666666666667e-06, + "loss": 0.0346, + "num_tokens": 336022.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 20.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10571005940437317, + "kl": 0.009991541504859924, + "learning_rate": 2.9663333333333333e-06, + "loss": 0.0005, + "num_tokens": 336282.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 20.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3314175605773926, + "kl": 0.11745191365480423, + "learning_rate": 2.966e-06, + "loss": 0.0058, + "num_tokens": 336607.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.558254241943359, + "kl": 0.14987434446811676, + "learning_rate": 2.965666666666667e-06, + "loss": 0.2771, + "num_tokens": 336939.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 20.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010132589377462864, + "kl": 0.000619823724264279, + "learning_rate": 2.9653333333333336e-06, + "loss": 0.0, + "num_tokens": 337211.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5016937255859375, + "kl": 0.037582204677164555, + "learning_rate": 2.965e-06, + "loss": 0.0958, + "num_tokens": 337490.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.07843017578125, + "kl": 0.018026244826614857, + "learning_rate": 2.9646666666666668e-06, + "loss": 0.1158, + "num_tokens": 337792.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 20.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03571224585175514, + "kl": 0.001817658543586731, + "learning_rate": 2.9643333333333336e-06, + "loss": 0.0001, + "num_tokens": 338004.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 20.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15281568467617035, + "kl": 0.017196177504956722, + "learning_rate": 2.964e-06, + "loss": 0.0008, + "num_tokens": 338302.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 20.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7225083112716675, + "kl": 0.1012876033782959, + "learning_rate": 2.9636666666666667e-06, + "loss": 0.0157, + "num_tokens": 338667.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 20.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42082706093788147, + "kl": 0.031162254512310028, + "learning_rate": 2.9633333333333335e-06, + "loss": 0.0016, + "num_tokens": 338927.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.59259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.228433132171631, + "kl": 0.020230777096003294, + "learning_rate": 2.9630000000000003e-06, + "loss": -0.0263, + "num_tokens": 339200.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.639192581176758, + "kl": 0.09708906058222055, + "learning_rate": 2.9626666666666666e-06, + "loss": 0.0452, + "num_tokens": 339495.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03252008184790611, + "kl": 0.003536023898050189, + "learning_rate": 2.9623333333333334e-06, + "loss": 0.0002, + "num_tokens": 339826.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 20.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016074860468506813, + "kl": 0.001966603100299835, + "learning_rate": 2.9619999999999998e-06, + "loss": 0.0001, + "num_tokens": 340042.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 20.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002983392623718828, + "kl": 9.645521640777588e-05, + "learning_rate": 2.9616666666666665e-06, + "loss": 0.0, + "num_tokens": 340262.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 20.685185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.892916440963745, + "kl": 0.005409277277067304, + "learning_rate": 2.9613333333333338e-06, + "loss": 0.0002, + "num_tokens": 340542.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1110639572143555, + "kl": 0.12851876206696033, + "learning_rate": 2.961e-06, + "loss": 0.148, + "num_tokens": 340819.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 20.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.510836601257324, + "kl": 0.036147153936326504, + "learning_rate": 2.960666666666667e-06, + "loss": 0.2088, + "num_tokens": 341063.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 20.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006750933825969696, + "kl": 0.0021908581256866455, + "learning_rate": 2.9603333333333333e-06, + "loss": 0.0001, + "num_tokens": 341323.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049142204225063324, + "kl": 0.0022561585064977407, + "learning_rate": 2.96e-06, + "loss": 0.0001, + "num_tokens": 341631.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06688922643661499, + "kl": 0.008426931453868747, + "learning_rate": 2.959666666666667e-06, + "loss": 0.0004, + "num_tokens": 341960.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 20.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19637010991573334, + "kl": 0.015841126441955566, + "learning_rate": 2.9593333333333336e-06, + "loss": 0.0007, + "num_tokens": 342272.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 20.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058728717267513275, + "kl": 0.00821069278754294, + "learning_rate": 2.959e-06, + "loss": 0.0004, + "num_tokens": 342604.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 20.833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.364965796470642, + "kl": 0.0349162295460701, + "learning_rate": 2.9586666666666667e-06, + "loss": 0.0524, + "num_tokens": 343035.0, + "reward": 2.799999952316284, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 20.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.671823978424072, + "kl": 0.019271957222372293, + "learning_rate": 2.9583333333333335e-06, + "loss": 0.0892, + "num_tokens": 343304.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 20.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5664737224578857, + "kl": 0.05136868730187416, + "learning_rate": 2.958e-06, + "loss": 0.0035, + "num_tokens": 343573.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 20.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004015078768134117, + "kl": 0.00041546672582626343, + "learning_rate": 2.9576666666666667e-06, + "loss": 0.0, + "num_tokens": 343785.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 20.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7527806758880615, + "kl": 0.07438867166638374, + "learning_rate": 2.9573333333333335e-06, + "loss": 0.0925, + "num_tokens": 344137.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 20.925925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.403362274169922, + "kl": 0.023505443707108498, + "learning_rate": 2.9570000000000002e-06, + "loss": 0.1409, + "num_tokens": 344424.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 20.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028070291504263878, + "kl": 0.0003178758197464049, + "learning_rate": 2.9566666666666666e-06, + "loss": 0.0, + "num_tokens": 344704.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 20.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.751282215118408, + "kl": 0.09102287143468857, + "learning_rate": 2.9563333333333334e-06, + "loss": -0.0446, + "num_tokens": 345032.0, + "reward": 4.125, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 2.25, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 20.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08037609606981277, + "kl": 0.011075216345489025, + "learning_rate": 2.9559999999999997e-06, + "loss": 0.0006, + "num_tokens": 345344.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 21.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08196492493152618, + "kl": 0.005020148353651166, + "learning_rate": 2.955666666666667e-06, + "loss": 0.0003, + "num_tokens": 345628.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 21.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.798346042633057, + "kl": 0.06818924844264984, + "learning_rate": 2.9553333333333337e-06, + "loss": -0.009, + "num_tokens": 345927.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 21.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24431799352169037, + "kl": 0.04779178276658058, + "learning_rate": 2.955e-06, + "loss": 0.0023, + "num_tokens": 346209.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 21.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12148759514093399, + "kl": 0.01206190837547183, + "learning_rate": 2.954666666666667e-06, + "loss": 0.0006, + "num_tokens": 346513.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 21.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1359328031539917, + "kl": 0.012502173180109821, + "learning_rate": 2.9543333333333332e-06, + "loss": 0.0004, + "num_tokens": 346811.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3.0, + "completions/max_terminated_length": 3.0, + "completions/mean_length": 2.25, + "completions/mean_terminated_length": 2.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 21.09259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.664088249206543, + "kl": 0.8295629173517227, + "learning_rate": 2.954e-06, + "loss": 0.1508, + "num_tokens": 347024.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 21.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008147433400154114, + "kl": 0.0007927305996417999, + "learning_rate": 2.953666666666667e-06, + "loss": 0.0, + "num_tokens": 347268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 21.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023207135964185, + "kl": 0.016516927629709244, + "learning_rate": 2.9533333333333336e-06, + "loss": 0.0008, + "num_tokens": 347528.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 21.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23874467611312866, + "kl": 0.011130547791253775, + "learning_rate": 2.953e-06, + "loss": 0.0005, + "num_tokens": 347796.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 21.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.717414855957031, + "kl": 0.07371293380856514, + "learning_rate": 2.9526666666666667e-06, + "loss": 0.0207, + "num_tokens": 348089.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 21.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012407519854605198, + "kl": 0.0006310045719146729, + "learning_rate": 2.9523333333333335e-06, + "loss": 0.0, + "num_tokens": 348295.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 59.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 59.5, + "completions/mean_terminated_length": 59.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 21.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0559470653533936, + "kl": 0.07835566624999046, + "learning_rate": 2.952e-06, + "loss": 0.36, + "num_tokens": 348753.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 21.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008729362860321999, + "kl": 0.0004869153199251741, + "learning_rate": 2.9516666666666666e-06, + "loss": 0.0, + "num_tokens": 349025.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02897246740758419, + "kl": 0.006061029154807329, + "learning_rate": 2.9513333333333334e-06, + "loss": 0.0003, + "num_tokens": 349313.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 21.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3830954134464264, + "kl": 0.01743863639421761, + "learning_rate": 2.951e-06, + "loss": 0.0009, + "num_tokens": 349534.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05649878457188606, + "kl": 0.004728741245344281, + "learning_rate": 2.9506666666666666e-06, + "loss": 0.0002, + "num_tokens": 349851.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 21.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.983824253082275, + "kl": 0.06738797202706337, + "learning_rate": 2.9503333333333333e-06, + "loss": 0.2527, + "num_tokens": 350175.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 21.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040783047676086426, + "kl": 0.005734084872528911, + "learning_rate": 2.9499999999999997e-06, + "loss": 0.0003, + "num_tokens": 350445.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 21.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10436109453439713, + "kl": 0.011295586824417114, + "learning_rate": 2.949666666666667e-06, + "loss": 0.0006, + "num_tokens": 350705.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 21.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017738359048962593, + "kl": 0.0003789237671298906, + "learning_rate": 2.9493333333333337e-06, + "loss": 0.0, + "num_tokens": 350941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 21.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.230639934539795, + "kl": 0.026744220405817032, + "learning_rate": 2.949e-06, + "loss": 0.0377, + "num_tokens": 351229.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 21.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.081344604492188, + "kl": 0.36256470531225204, + "learning_rate": 2.948666666666667e-06, + "loss": -0.2009, + "num_tokens": 351548.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 21.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04915767163038254, + "kl": 0.005341783398762345, + "learning_rate": 2.948333333333333e-06, + "loss": 0.0003, + "num_tokens": 351885.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 21.425925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.716984748840332, + "kl": 0.044003942515701056, + "learning_rate": 2.948e-06, + "loss": 0.0012, + "num_tokens": 352314.0, + "reward": 1.5499999523162842, + "reward_std": 1.2556538581848145, + "rewards/reward_combined/mean": 1.5499999523162842, + "rewards/reward_combined/std": 1.2556538581848145, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 21.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026875631883740425, + "kl": 0.006697945529595017, + "learning_rate": 2.9476666666666668e-06, + "loss": 0.0003, + "num_tokens": 352636.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 21.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012115325778722763, + "kl": 0.001387697469908744, + "learning_rate": 2.9473333333333335e-06, + "loss": 0.0001, + "num_tokens": 352896.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.894453048706055, + "kl": 0.020720298402011395, + "learning_rate": 2.947e-06, + "loss": 0.0279, + "num_tokens": 353171.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 21.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.975372791290283, + "kl": 0.1301591955125332, + "learning_rate": 2.9466666666666667e-06, + "loss": 0.1615, + "num_tokens": 353496.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 21.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19771425426006317, + "kl": 0.007606446743011475, + "learning_rate": 2.9463333333333335e-06, + "loss": 0.0004, + "num_tokens": 353756.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 21.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2030460387468338, + "kl": 0.018800528720021248, + "learning_rate": 2.946e-06, + "loss": 0.001, + "num_tokens": 354021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 21.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.497882604598999, + "kl": 0.0030176237924024463, + "learning_rate": 2.945666666666667e-06, + "loss": 0.0004, + "num_tokens": 354281.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 21.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.182275295257568, + "kl": 0.014296102803200483, + "learning_rate": 2.9453333333333334e-06, + "loss": 0.011, + "num_tokens": 354612.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11543595045804977, + "kl": 0.011262784712016582, + "learning_rate": 2.945e-06, + "loss": 0.0006, + "num_tokens": 354905.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 21.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.05928897857666, + "kl": 0.1998056210577488, + "learning_rate": 2.9446666666666665e-06, + "loss": 0.0869, + "num_tokens": 355259.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020188773050904274, + "kl": 0.0010549085563980043, + "learning_rate": 2.9443333333333333e-06, + "loss": 0.0001, + "num_tokens": 355539.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 21.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01794954389333725, + "kl": 0.003737920429557562, + "learning_rate": 2.944e-06, + "loss": 0.0002, + "num_tokens": 355795.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 21.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08125066757202148, + "kl": 0.0040408282075077295, + "learning_rate": 2.943666666666667e-06, + "loss": 0.0002, + "num_tokens": 356109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 21.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23854506015777588, + "kl": 0.005255371332168579, + "learning_rate": 2.9433333333333337e-06, + "loss": 0.0003, + "num_tokens": 356325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 21.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.874303817749023, + "kl": 0.07863211538642645, + "learning_rate": 2.943e-06, + "loss": 0.1966, + "num_tokens": 356613.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 21.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.840372085571289, + "kl": 0.07953836768865585, + "learning_rate": 2.942666666666667e-06, + "loss": -0.0632, + "num_tokens": 356901.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 21.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002117764379363507, + "kl": 0.00011216849088668823, + "learning_rate": 2.942333333333333e-06, + "loss": 0.0, + "num_tokens": 357121.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 21.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.825308322906494, + "kl": 0.03514570742845535, + "learning_rate": 2.942e-06, + "loss": -0.1674, + "num_tokens": 357462.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 21.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06371951848268509, + "kl": 0.010883115697652102, + "learning_rate": 2.9416666666666667e-06, + "loss": 0.0005, + "num_tokens": 357734.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 21.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3431745767593384, + "kl": 0.03118173498660326, + "learning_rate": 2.9413333333333335e-06, + "loss": 0.0015, + "num_tokens": 358038.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 21.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03796914219856262, + "kl": 0.007035271963104606, + "learning_rate": 2.941e-06, + "loss": 0.0004, + "num_tokens": 358320.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 21.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11631365865468979, + "kl": 0.012384260538965464, + "learning_rate": 2.9406666666666667e-06, + "loss": 0.0006, + "num_tokens": 358654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 21.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.747598171234131, + "kl": 0.16620274633169174, + "learning_rate": 2.9403333333333334e-06, + "loss": -0.1208, + "num_tokens": 358979.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 21.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3009033203125, + "kl": 0.1012641042470932, + "learning_rate": 2.9400000000000002e-06, + "loss": -0.0134, + "num_tokens": 359341.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 21.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02373761124908924, + "kl": 0.008252784609794617, + "learning_rate": 2.939666666666667e-06, + "loss": 0.0004, + "num_tokens": 359609.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 21.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04895434156060219, + "kl": 0.00742289237678051, + "learning_rate": 2.9393333333333334e-06, + "loss": 0.0004, + "num_tokens": 359921.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 21.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2559320628643036, + "kl": 0.12430773675441742, + "learning_rate": 2.939e-06, + "loss": 0.0061, + "num_tokens": 360252.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 21.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002980705350637436, + "kl": 0.0005115270614624023, + "learning_rate": 2.9386666666666665e-06, + "loss": 0.0, + "num_tokens": 360464.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 21.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00200333702377975, + "kl": 0.0035089924931526184, + "learning_rate": 2.9383333333333333e-06, + "loss": 0.0002, + "num_tokens": 360700.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 21.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5305435061454773, + "kl": 0.016145928762853146, + "learning_rate": 2.938e-06, + "loss": 0.0008, + "num_tokens": 360996.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 22.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9100046157836914, + "kl": 0.14727820456027985, + "learning_rate": 2.937666666666667e-06, + "loss": 0.0545, + "num_tokens": 361327.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 22.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003309160703793168, + "kl": 0.0005266964435577393, + "learning_rate": 2.9373333333333336e-06, + "loss": 0.0, + "num_tokens": 361539.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 22.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5165593028068542, + "kl": 0.0408717580139637, + "learning_rate": 2.937e-06, + "loss": 0.002, + "num_tokens": 361829.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 22.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04083992540836334, + "kl": 0.0015979359450284392, + "learning_rate": 2.9366666666666668e-06, + "loss": 0.0001, + "num_tokens": 362141.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 22.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18477961421012878, + "kl": 0.00941312313079834, + "learning_rate": 2.936333333333333e-06, + "loss": 0.0005, + "num_tokens": 362357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 22.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2500711679458618, + "kl": 0.030659684911370277, + "learning_rate": 2.936e-06, + "loss": 0.0015, + "num_tokens": 362686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 22.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.044258117675781, + "kl": 0.00922396220266819, + "learning_rate": 2.9356666666666667e-06, + "loss": 0.0593, + "num_tokens": 363002.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 22.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07881251722574234, + "kl": 0.00452762097120285, + "learning_rate": 2.9353333333333335e-06, + "loss": 0.0002, + "num_tokens": 363262.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 22.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001921364339068532, + "kl": 0.016606775112450123, + "learning_rate": 2.9350000000000003e-06, + "loss": 0.0008, + "num_tokens": 363522.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 22.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034760188311338425, + "kl": 0.0016343023162335157, + "learning_rate": 2.9346666666666666e-06, + "loss": 0.0001, + "num_tokens": 363800.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 22.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6671550869941711, + "kl": 0.060452768579125404, + "learning_rate": 2.9343333333333334e-06, + "loss": 0.0038, + "num_tokens": 364077.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 22.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.49239590764045715, + "kl": 0.028446731623262167, + "learning_rate": 2.934e-06, + "loss": 0.0019, + "num_tokens": 364348.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 22.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09381739050149918, + "kl": 0.006515514440252446, + "learning_rate": 2.933666666666667e-06, + "loss": 0.0003, + "num_tokens": 364583.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08807060867547989, + "kl": 0.004534887499175966, + "learning_rate": 2.9333333333333333e-06, + "loss": 0.0002, + "num_tokens": 364879.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19829215109348297, + "kl": 0.03475787350907922, + "learning_rate": 2.933e-06, + "loss": 0.0017, + "num_tokens": 365155.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 22.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021598278544843197, + "kl": 0.0034804120659828186, + "learning_rate": 2.9326666666666665e-06, + "loss": 0.0002, + "num_tokens": 365391.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 22.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.093035988509655, + "kl": 0.043035659939050674, + "learning_rate": 2.9323333333333333e-06, + "loss": 0.0021, + "num_tokens": 365694.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 22.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03102559968829155, + "kl": 0.007453024387359619, + "learning_rate": 2.932e-06, + "loss": 0.0004, + "num_tokens": 365906.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 22.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06748061627149582, + "kl": 0.008585278643295169, + "learning_rate": 2.931666666666667e-06, + "loss": 0.0004, + "num_tokens": 366255.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 22.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06693046540021896, + "kl": 0.13564538955688477, + "learning_rate": 2.9313333333333336e-06, + "loss": 0.0066, + "num_tokens": 366571.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 22.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16144220530986786, + "kl": 0.05471273045986891, + "learning_rate": 2.931e-06, + "loss": 0.0027, + "num_tokens": 366863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 22.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1028034687042236, + "kl": 0.18381104990839958, + "learning_rate": 2.9306666666666668e-06, + "loss": 0.2272, + "num_tokens": 367212.0, + "reward": 3.25, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 22.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020064125419594347, + "kl": 0.00011434406042098999, + "learning_rate": 2.930333333333333e-06, + "loss": 0.0, + "num_tokens": 367432.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 22.425925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1972577571868896, + "kl": 0.12325546145439148, + "learning_rate": 2.9300000000000003e-06, + "loss": -0.003, + "num_tokens": 367795.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 22.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3981404304504395, + "kl": 0.008597993873991072, + "learning_rate": 2.9296666666666667e-06, + "loss": -0.0112, + "num_tokens": 368129.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 22.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07383531332015991, + "kl": 0.04913156945258379, + "learning_rate": 2.9293333333333335e-06, + "loss": 0.0023, + "num_tokens": 368489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 22.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.8061723709106445, + "kl": 0.017924664542078972, + "learning_rate": 2.9290000000000002e-06, + "loss": 0.0294, + "num_tokens": 368790.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 22.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016739238053560257, + "kl": 0.0011711865663528442, + "learning_rate": 2.9286666666666666e-06, + "loss": 0.0001, + "num_tokens": 369070.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005845101550221443, + "kl": 0.017530377954244614, + "learning_rate": 2.9283333333333334e-06, + "loss": 0.0009, + "num_tokens": 369354.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 22.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2753112018108368, + "kl": 0.04153232462704182, + "learning_rate": 2.928e-06, + "loss": 0.0021, + "num_tokens": 369687.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 22.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05116073042154312, + "kl": 0.002799829700961709, + "learning_rate": 2.927666666666667e-06, + "loss": 0.0001, + "num_tokens": 369947.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 22.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14668123424053192, + "kl": 0.01994048012420535, + "learning_rate": 2.9273333333333333e-06, + "loss": 0.0011, + "num_tokens": 370229.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 22.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10535865277051926, + "kl": 0.014556014444679022, + "learning_rate": 2.927e-06, + "loss": 0.0007, + "num_tokens": 370502.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 22.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.075160026550293, + "kl": 0.023662267718464136, + "learning_rate": 2.9266666666666665e-06, + "loss": 0.14, + "num_tokens": 370849.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 22.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.379148960113525, + "kl": 0.07176013104617596, + "learning_rate": 2.9263333333333332e-06, + "loss": 0.007, + "num_tokens": 371141.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 22.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.767480850219727, + "kl": 0.3034612610936165, + "learning_rate": 2.926e-06, + "loss": 0.0001, + "num_tokens": 371467.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 22.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.768539905548096, + "kl": 0.0630022007972002, + "learning_rate": 2.925666666666667e-06, + "loss": -0.0181, + "num_tokens": 371811.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3131734132766724, + "kl": 0.1380967851728201, + "learning_rate": 2.9253333333333336e-06, + "loss": 0.0074, + "num_tokens": 372085.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.008620689623057842, + "clip_ratio/high_mean": 0.008620689623057842, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008620689623057842, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 22.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9731311798095703, + "kl": 0.04745516739785671, + "learning_rate": 2.925e-06, + "loss": -0.006, + "num_tokens": 372431.0, + "reward": 5.5, + "reward_std": 2.6140644550323486, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 2.6140644550323486, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 22.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008209033869206905, + "kl": 0.0010252483189105988, + "learning_rate": 2.9246666666666667e-06, + "loss": 0.0001, + "num_tokens": 372675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 22.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005959033966064453, + "kl": 0.002459391951560974, + "learning_rate": 2.9243333333333335e-06, + "loss": 0.0001, + "num_tokens": 372935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.888219833374023, + "kl": 0.012552839703857899, + "learning_rate": 2.9240000000000003e-06, + "loss": -0.033, + "num_tokens": 373225.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 22.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.72470223903656, + "kl": 0.16640783549519256, + "learning_rate": 2.9236666666666667e-06, + "loss": 0.0074, + "num_tokens": 373444.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 22.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34498071670532227, + "kl": 0.07076587900519371, + "learning_rate": 2.9233333333333334e-06, + "loss": 0.0035, + "num_tokens": 373719.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 22.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.360834002494812, + "kl": 0.030715636909008026, + "learning_rate": 2.9230000000000002e-06, + "loss": 0.0015, + "num_tokens": 373991.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 22.833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.977489471435547, + "kl": 0.10888610035181046, + "learning_rate": 2.9226666666666666e-06, + "loss": 0.0273, + "num_tokens": 374328.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 22.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054354228079319, + "kl": 0.0016398903680965304, + "learning_rate": 2.9223333333333334e-06, + "loss": 0.0001, + "num_tokens": 374634.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 22.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07077593356370926, + "kl": 0.03445947263389826, + "learning_rate": 2.922e-06, + "loss": 0.0017, + "num_tokens": 374946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 22.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07003846764564514, + "kl": 0.04414568841457367, + "learning_rate": 2.921666666666667e-06, + "loss": 0.0021, + "num_tokens": 375360.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 22.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17434364557266235, + "kl": 0.011220123968087137, + "learning_rate": 2.9213333333333333e-06, + "loss": 0.0006, + "num_tokens": 375658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 22.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008249103091657162, + "kl": 0.0030403323471546173, + "learning_rate": 2.921e-06, + "loss": 0.0001, + "num_tokens": 375930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 22.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8902098536491394, + "kl": 0.04571262560784817, + "learning_rate": 2.9206666666666664e-06, + "loss": 0.0023, + "num_tokens": 376186.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 22.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10242968797683716, + "kl": 0.009457210544496775, + "learning_rate": 2.9203333333333332e-06, + "loss": 0.0005, + "num_tokens": 376452.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 22.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7022817730903625, + "kl": 0.1087116226553917, + "learning_rate": 2.9200000000000004e-06, + "loss": 0.0051, + "num_tokens": 376774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.25, + "completions/mean_terminated_length": 3.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 23.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 35.80710983276367, + "kl": 0.08410745114088058, + "learning_rate": 2.9196666666666668e-06, + "loss": 0.2291, + "num_tokens": 376983.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 23.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09875809401273727, + "kl": 0.0028782979061361402, + "learning_rate": 2.9193333333333336e-06, + "loss": 0.0001, + "num_tokens": 377239.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 23.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050571467727422714, + "kl": 0.0026585019659250975, + "learning_rate": 2.919e-06, + "loss": 0.0002, + "num_tokens": 377482.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 23.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039863619953393936, + "kl": 0.0019241442787460983, + "learning_rate": 2.9186666666666667e-06, + "loss": 0.0001, + "num_tokens": 377790.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08739642798900604, + "kl": 0.009353132452815771, + "learning_rate": 2.9183333333333335e-06, + "loss": 0.0005, + "num_tokens": 378092.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 23.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10864747315645218, + "kl": 0.003104984760284424, + "learning_rate": 2.9180000000000003e-06, + "loss": 0.0002, + "num_tokens": 378308.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 23.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06950744986534119, + "kl": 0.010574434418231249, + "learning_rate": 2.9176666666666666e-06, + "loss": 0.0006, + "num_tokens": 378643.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 23.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0016770362854, + "kl": 0.0061819166876375675, + "learning_rate": 2.9173333333333334e-06, + "loss": 0.3682, + "num_tokens": 378947.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 23.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01288084127008915, + "kl": 0.005341643700376153, + "learning_rate": 2.917e-06, + "loss": 0.0003, + "num_tokens": 379215.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 23.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.228269577026367, + "kl": 0.031007222831249237, + "learning_rate": 2.9166666666666666e-06, + "loss": 0.0062, + "num_tokens": 379479.0, + "reward": 1.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.25, + "rewards/reward_combined/std": 1.5, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 23.185185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.015903472900391, + "kl": 0.00810985779389739, + "learning_rate": 2.9163333333333333e-06, + "loss": 0.1962, + "num_tokens": 379812.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 23.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00765201635658741, + "kl": 0.0014866248238831758, + "learning_rate": 2.916e-06, + "loss": 0.0001, + "num_tokens": 380072.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 23.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11550978571176529, + "kl": 0.07717056572437286, + "learning_rate": 2.915666666666667e-06, + "loss": 0.0039, + "num_tokens": 380434.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 23.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3842141032218933, + "kl": 0.056036993861198425, + "learning_rate": 2.9153333333333333e-06, + "loss": 0.0027, + "num_tokens": 380728.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 23.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3481963276863098, + "kl": 0.0385284349322319, + "learning_rate": 2.915e-06, + "loss": 0.0023, + "num_tokens": 381050.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 23.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2472708374261856, + "kl": 0.01906286645680666, + "learning_rate": 2.9146666666666664e-06, + "loss": 0.0009, + "num_tokens": 381273.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 23.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.705979585647583, + "kl": 0.043598782271146774, + "learning_rate": 2.9143333333333336e-06, + "loss": 0.0022, + "num_tokens": 381517.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 23.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003512790659442544, + "kl": 0.0031919777393341064, + "learning_rate": 2.9140000000000004e-06, + "loss": 0.0002, + "num_tokens": 381753.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 23.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018005406484007835, + "kl": 0.0005531683564186096, + "learning_rate": 2.9136666666666668e-06, + "loss": 0.0, + "num_tokens": 381965.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 23.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.750605821609497, + "kl": 0.009492204524576664, + "learning_rate": 2.9133333333333335e-06, + "loss": 0.0006, + "num_tokens": 382237.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006020226515829563, + "kl": 0.01743131224066019, + "learning_rate": 2.913e-06, + "loss": 0.0009, + "num_tokens": 382521.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 23.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05591482296586037, + "kl": 0.0333207193762064, + "learning_rate": 2.9126666666666667e-06, + "loss": 0.0016, + "num_tokens": 382940.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 23.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20767620205879211, + "kl": 0.00659352820366621, + "learning_rate": 2.9123333333333335e-06, + "loss": 0.0003, + "num_tokens": 383236.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 23.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23847971856594086, + "kl": 0.08137239515781403, + "learning_rate": 2.9120000000000002e-06, + "loss": 0.0041, + "num_tokens": 383556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 23.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36015400290489197, + "kl": 0.023563608527183533, + "learning_rate": 2.9116666666666666e-06, + "loss": 0.0012, + "num_tokens": 383764.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18822342157363892, + "kl": 0.022435004822909832, + "learning_rate": 2.9113333333333334e-06, + "loss": 0.0011, + "num_tokens": 384048.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 23.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054273031651973724, + "kl": 0.004214713117107749, + "learning_rate": 2.911e-06, + "loss": 0.0002, + "num_tokens": 384371.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 23.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.523254871368408, + "kl": 0.022931042592972517, + "learning_rate": 2.9106666666666665e-06, + "loss": -0.0228, + "num_tokens": 384699.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 23.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0046770935878157616, + "kl": 0.016160299070179462, + "learning_rate": 2.9103333333333333e-06, + "loss": 0.0008, + "num_tokens": 384959.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 23.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27035823464393616, + "kl": 0.04531625285744667, + "learning_rate": 2.91e-06, + "loss": 0.0024, + "num_tokens": 385248.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 23.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33856290578842163, + "kl": 0.03045171545818448, + "learning_rate": 2.909666666666667e-06, + "loss": 0.0018, + "num_tokens": 385522.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 23.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5626940727233887, + "kl": 0.02123592747375369, + "learning_rate": 2.9093333333333332e-06, + "loss": 0.1222, + "num_tokens": 385905.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 23.59259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.170790672302246, + "kl": 0.2881488502025604, + "learning_rate": 2.909e-06, + "loss": 0.0368, + "num_tokens": 386208.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 23.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0086846351623535, + "kl": 0.015922888182103634, + "learning_rate": 2.9086666666666664e-06, + "loss": 0.1219, + "num_tokens": 386549.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 23.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6355563402175903, + "kl": 0.1421504244208336, + "learning_rate": 2.9083333333333336e-06, + "loss": 0.007, + "num_tokens": 386888.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 23.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.170419216156006, + "kl": 0.029172319918870926, + "learning_rate": 2.9080000000000004e-06, + "loss": -0.0429, + "num_tokens": 387234.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 23.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.438594341278076, + "kl": 0.07794092409312725, + "learning_rate": 2.9076666666666667e-06, + "loss": -0.0425, + "num_tokens": 387585.0, + "reward": 3.75, + "reward_std": 2.723355770111084, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.723355770111084, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 23.685185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.892442464828491, + "kl": 0.0718070799484849, + "learning_rate": 2.9073333333333335e-06, + "loss": 0.1846, + "num_tokens": 387892.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 23.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.676334381103516, + "kl": 0.06440005823969841, + "learning_rate": 2.907e-06, + "loss": 0.1175, + "num_tokens": 388252.0, + "reward": 2.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 2.25, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10585566610097885, + "kl": 0.015857995487749577, + "learning_rate": 2.9066666666666666e-06, + "loss": 0.0008, + "num_tokens": 388541.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 23.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.429829120635986, + "kl": 0.1935255378484726, + "learning_rate": 2.9063333333333334e-06, + "loss": 0.0604, + "num_tokens": 388841.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 23.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001566532882861793, + "kl": 0.00011816620826721191, + "learning_rate": 2.9060000000000002e-06, + "loss": 0.0, + "num_tokens": 389061.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 23.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020859844516962767, + "kl": 0.0010903222137130797, + "learning_rate": 2.9056666666666666e-06, + "loss": 0.0001, + "num_tokens": 389341.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 23.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012579447589814663, + "kl": 0.0007582803373225033, + "learning_rate": 2.9053333333333334e-06, + "loss": 0.0, + "num_tokens": 389613.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 23.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15366342663764954, + "kl": 0.054549604654312134, + "learning_rate": 2.905e-06, + "loss": 0.0028, + "num_tokens": 389922.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 23.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03424161672592163, + "kl": 0.0021857281972188503, + "learning_rate": 2.9046666666666665e-06, + "loss": 0.0001, + "num_tokens": 390233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 23.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.632739543914795, + "kl": 0.06287204567342997, + "learning_rate": 2.9043333333333337e-06, + "loss": 0.2307, + "num_tokens": 390643.0, + "reward": 4.375, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 3.902456521987915, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 23.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02854452282190323, + "kl": 0.010224854573607445, + "learning_rate": 2.904e-06, + "loss": 0.0005, + "num_tokens": 390945.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 23.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07448150217533112, + "kl": 0.0120579544454813, + "learning_rate": 2.903666666666667e-06, + "loss": 0.0006, + "num_tokens": 391288.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4822940826416016, + "kl": 0.1271328292787075, + "learning_rate": 2.903333333333333e-06, + "loss": -0.0378, + "num_tokens": 391586.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 23.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05746985971927643, + "kl": 0.15698464959859848, + "learning_rate": 2.903e-06, + "loss": 0.0078, + "num_tokens": 391894.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 23.944444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1779134273529053, + "kl": 0.010984099702909589, + "learning_rate": 2.9026666666666668e-06, + "loss": 0.0229, + "num_tokens": 392153.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 23.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1667739897966385, + "kl": 0.011745782569050789, + "learning_rate": 2.9023333333333336e-06, + "loss": 0.0006, + "num_tokens": 392386.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 23.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017675377428531647, + "kl": 0.006357002770528197, + "learning_rate": 2.9020000000000003e-06, + "loss": 0.0003, + "num_tokens": 392658.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 24.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01812010630965233, + "kl": 0.003440248081460595, + "learning_rate": 2.9016666666666667e-06, + "loss": 0.0002, + "num_tokens": 392938.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07860206067562103, + "kl": 0.009696295484900475, + "learning_rate": 2.9013333333333335e-06, + "loss": 0.0005, + "num_tokens": 393222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 24.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03346037492156029, + "kl": 0.004525701981037855, + "learning_rate": 2.901e-06, + "loss": 0.0002, + "num_tokens": 393494.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 24.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7806396484375, + "kl": 0.0825969516299665, + "learning_rate": 2.9006666666666666e-06, + "loss": 0.0044, + "num_tokens": 393785.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 24.074074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.100828170776367, + "kl": 0.035566192818805575, + "learning_rate": 2.9003333333333334e-06, + "loss": -0.0454, + "num_tokens": 394051.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 24.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0176968015730381, + "kl": 0.0011995251406915486, + "learning_rate": 2.9e-06, + "loss": 0.0001, + "num_tokens": 394353.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 24.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.3940958976745605, + "kl": 0.016883139964193106, + "learning_rate": 2.8996666666666665e-06, + "loss": -0.0027, + "num_tokens": 394687.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 24.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.309798002243042, + "kl": 0.10009177401661873, + "learning_rate": 2.8993333333333333e-06, + "loss": 0.0306, + "num_tokens": 395091.0, + "reward": 1.125, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.125, + "rewards/reward_combined/std": 1.25, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 24.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10385389626026154, + "kl": 0.004034673795104027, + "learning_rate": 2.899e-06, + "loss": 0.0002, + "num_tokens": 395324.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 24.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1049782857298851, + "kl": 0.002744565485045314, + "learning_rate": 2.898666666666667e-06, + "loss": 0.0001, + "num_tokens": 395540.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 24.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22583812475204468, + "kl": 0.035681961104273796, + "learning_rate": 2.8983333333333337e-06, + "loss": 0.0019, + "num_tokens": 395871.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 24.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002493814332410693, + "kl": 0.0010850706021301448, + "learning_rate": 2.898e-06, + "loss": 0.0001, + "num_tokens": 396151.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 24.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.346193313598633, + "kl": 0.09355531260371208, + "learning_rate": 2.897666666666667e-06, + "loss": 0.0463, + "num_tokens": 396488.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 24.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018391605408396572, + "kl": 0.00011413544416427612, + "learning_rate": 2.897333333333333e-06, + "loss": 0.0, + "num_tokens": 396708.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 24.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030608419328927994, + "kl": 0.0014388965792022645, + "learning_rate": 2.897e-06, + "loss": 0.0001, + "num_tokens": 397024.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 24.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.261711448431015, + "kl": 0.019011598080396652, + "learning_rate": 2.8966666666666667e-06, + "loss": 0.001, + "num_tokens": 397284.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.060579299926758, + "kl": 0.05696502886712551, + "learning_rate": 2.8963333333333335e-06, + "loss": 0.0357, + "num_tokens": 397560.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 24.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04632307216525078, + "kl": 0.007019456010311842, + "learning_rate": 2.8960000000000003e-06, + "loss": 0.0004, + "num_tokens": 397898.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.333333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.684145927429199, + "kl": 0.019601126201450825, + "learning_rate": 2.8956666666666667e-06, + "loss": 0.218, + "num_tokens": 398204.0, + "reward": 4.625, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 4.308422088623047, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 24.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6183109283447266, + "kl": 0.07152040116488934, + "learning_rate": 2.8953333333333335e-06, + "loss": 0.0009, + "num_tokens": 398588.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 24.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.739463806152344, + "kl": 0.017192358151078224, + "learning_rate": 2.895e-06, + "loss": 0.1828, + "num_tokens": 398841.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 24.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006905599497258663, + "kl": 0.015825394541025162, + "learning_rate": 2.8946666666666666e-06, + "loss": 0.0008, + "num_tokens": 399101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 24.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03127503767609596, + "kl": 0.00217806757427752, + "learning_rate": 2.8943333333333334e-06, + "loss": 0.0001, + "num_tokens": 399362.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 24.425925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.607936382293701, + "kl": 0.058576663956046104, + "learning_rate": 2.894e-06, + "loss": 0.1897, + "num_tokens": 399682.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 24.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03317941352725029, + "kl": 0.004169125575572252, + "learning_rate": 2.893666666666667e-06, + "loss": 0.0002, + "num_tokens": 399958.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 24.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015253948979079723, + "kl": 0.08897051960229874, + "learning_rate": 2.8933333333333333e-06, + "loss": 0.0044, + "num_tokens": 400324.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 24.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2393220216035843, + "kl": 0.04741102457046509, + "learning_rate": 2.893e-06, + "loss": 0.0026, + "num_tokens": 400691.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4361139237880707, + "kl": 0.06529728788882494, + "learning_rate": 2.892666666666667e-06, + "loss": 0.0028, + "num_tokens": 400989.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 24.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07573315501213074, + "kl": 0.012549018487334251, + "learning_rate": 2.8923333333333336e-06, + "loss": 0.0006, + "num_tokens": 401325.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 24.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003119494765996933, + "kl": 0.00016990544827422127, + "learning_rate": 2.892e-06, + "loss": 0.0, + "num_tokens": 401632.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 24.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1930025815963745, + "kl": 0.0218666922301054, + "learning_rate": 2.891666666666667e-06, + "loss": 0.0011, + "num_tokens": 401888.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 24.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.869344234466553, + "kl": 0.18673317320644855, + "learning_rate": 2.891333333333333e-06, + "loss": 0.2823, + "num_tokens": 402241.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 24.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03775437921285629, + "kl": 0.000459328293800354, + "learning_rate": 2.891e-06, + "loss": 0.0, + "num_tokens": 402453.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 24.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.794774055480957, + "kl": 0.08449650928378105, + "learning_rate": 2.8906666666666667e-06, + "loss": -0.0032, + "num_tokens": 402803.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 24.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005466298200190067, + "kl": 0.0009227111877407879, + "learning_rate": 2.8903333333333335e-06, + "loss": 0.0, + "num_tokens": 403063.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 24.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0044248453341424465, + "kl": 9.804964065551758e-05, + "learning_rate": 2.8900000000000003e-06, + "loss": 0.0, + "num_tokens": 403283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 24.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004193097818642855, + "kl": 0.0031790658831596375, + "learning_rate": 2.8896666666666666e-06, + "loss": 0.0002, + "num_tokens": 403519.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 24.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03349853679537773, + "kl": 0.004807890392839909, + "learning_rate": 2.8893333333333334e-06, + "loss": 0.0002, + "num_tokens": 403831.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 24.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08382602035999298, + "kl": 0.03187142685055733, + "learning_rate": 2.8889999999999998e-06, + "loss": 0.0016, + "num_tokens": 404127.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 24.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.491166591644287, + "kl": 0.06389089673757553, + "learning_rate": 2.888666666666667e-06, + "loss": -0.0448, + "num_tokens": 404474.0, + "reward": 1.5, + "reward_std": 2.2730302810668945, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 2.2730302810668945, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 24.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21916526556015015, + "kl": 0.0208267355337739, + "learning_rate": 2.8883333333333333e-06, + "loss": 0.001, + "num_tokens": 404737.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 24.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18290232121944427, + "kl": 0.013282960280776024, + "learning_rate": 2.888e-06, + "loss": 0.0007, + "num_tokens": 405063.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 24.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6394146680831909, + "kl": 0.06432866025716066, + "learning_rate": 2.887666666666667e-06, + "loss": 0.0043, + "num_tokens": 405340.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 24.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16449198126792908, + "kl": 0.0064424017909914255, + "learning_rate": 2.8873333333333333e-06, + "loss": 0.0003, + "num_tokens": 405567.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 24.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41378939151763916, + "kl": 0.05045641399919987, + "learning_rate": 2.887e-06, + "loss": 0.0025, + "num_tokens": 405858.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150913283228874, + "kl": 0.008626286871731281, + "learning_rate": 2.886666666666667e-06, + "loss": 0.0004, + "num_tokens": 406132.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 24.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.189365863800049, + "kl": 0.15324417501688004, + "learning_rate": 2.8863333333333336e-06, + "loss": -0.0299, + "num_tokens": 406463.0, + "reward": 2.875, + "reward_std": 3.8810436725616455, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 3.8810436725616455, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 24.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11873263120651245, + "kl": 0.012811253778636456, + "learning_rate": 2.886e-06, + "loss": 0.0007, + "num_tokens": 406751.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 24.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02632307820022106, + "kl": 0.005434486083686352, + "learning_rate": 2.8856666666666668e-06, + "loss": 0.0003, + "num_tokens": 407021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 24.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027666503563523293, + "kl": 0.0009456351399421692, + "learning_rate": 2.885333333333333e-06, + "loss": 0.0, + "num_tokens": 407229.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 24.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04538443684577942, + "kl": 0.008396757300943136, + "learning_rate": 2.885e-06, + "loss": 0.0004, + "num_tokens": 407517.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 24.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02309548854827881, + "kl": 0.0010463881844771095, + "learning_rate": 2.8846666666666667e-06, + "loss": 0.0001, + "num_tokens": 407831.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008771929889917374, + "clip_ratio/low_min": 0.008771929889917374, + "clip_ratio/region_mean": 0.008771929889917374, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 24.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.706453800201416, + "kl": 0.07297445461153984, + "learning_rate": 2.8843333333333335e-06, + "loss": 0.0943, + "num_tokens": 408167.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 24.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04379303380846977, + "kl": 0.004015351412817836, + "learning_rate": 2.8840000000000003e-06, + "loss": 0.0002, + "num_tokens": 408439.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.96112060546875, + "kl": 0.0038274761172942817, + "learning_rate": 2.8836666666666666e-06, + "loss": 0.0808, + "num_tokens": 408741.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 25.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.13623046875, + "kl": 0.01971262990264222, + "learning_rate": 2.8833333333333334e-06, + "loss": 0.2748, + "num_tokens": 409021.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 25.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038380788173526525, + "kl": 0.00040895864367485046, + "learning_rate": 2.883e-06, + "loss": 0.0, + "num_tokens": 409265.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 25.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10849466174840927, + "kl": 0.013719635549932718, + "learning_rate": 2.882666666666667e-06, + "loss": 0.0006, + "num_tokens": 409563.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 25.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004174836911261082, + "kl": 0.016189759597182274, + "learning_rate": 2.8823333333333333e-06, + "loss": 0.0008, + "num_tokens": 409823.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 25.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017169617116451263, + "kl": 0.0010407405789010227, + "learning_rate": 2.882e-06, + "loss": 0.0001, + "num_tokens": 410085.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 25.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003414747305214405, + "kl": 0.0009467572235735133, + "learning_rate": 2.881666666666667e-06, + "loss": 0.0, + "num_tokens": 410304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 25.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9151815176010132, + "kl": 0.1177082397043705, + "learning_rate": 2.8813333333333332e-06, + "loss": 0.0059, + "num_tokens": 410608.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 25.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.594345569610596, + "kl": 0.03394792787730694, + "learning_rate": 2.881e-06, + "loss": 0.0302, + "num_tokens": 410901.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 25.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6811065673828125, + "kl": 0.03273935429751873, + "learning_rate": 2.880666666666667e-06, + "loss": 0.1046, + "num_tokens": 411205.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 25.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056551720947027206, + "kl": 0.0023839278146624565, + "learning_rate": 2.8803333333333336e-06, + "loss": 0.0001, + "num_tokens": 411512.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 25.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004546754993498325, + "kl": 0.0031198635697364807, + "learning_rate": 2.88e-06, + "loss": 0.0002, + "num_tokens": 411748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.902839183807373, + "kl": 0.04041749658063054, + "learning_rate": 2.8796666666666667e-06, + "loss": 0.1691, + "num_tokens": 412098.0, + "reward": 7.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.375, + "rewards/reward_combined/std": 0.25, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 25.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.713037490844727, + "kl": 0.015319585800170898, + "learning_rate": 2.879333333333333e-06, + "loss": 0.102, + "num_tokens": 412362.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 25.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16719676554203033, + "kl": 0.002106286585330963, + "learning_rate": 2.879e-06, + "loss": 0.0001, + "num_tokens": 412574.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 25.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3861662745475769, + "kl": 0.08083298802375793, + "learning_rate": 2.878666666666667e-06, + "loss": 0.0041, + "num_tokens": 412933.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 25.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3336963653564453, + "kl": 0.10030456259846687, + "learning_rate": 2.8783333333333334e-06, + "loss": 0.005, + "num_tokens": 413277.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10147012025117874, + "kl": 0.027383566834032536, + "learning_rate": 2.8780000000000002e-06, + "loss": 0.0014, + "num_tokens": 413569.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 25.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4869394302368164, + "kl": 0.07689347770065069, + "learning_rate": 2.8776666666666666e-06, + "loss": 0.0031, + "num_tokens": 413892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 25.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10416198521852493, + "kl": 0.0032396416645497084, + "learning_rate": 2.8773333333333334e-06, + "loss": 0.0002, + "num_tokens": 414108.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 25.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13489079475402832, + "kl": 0.008247362682595849, + "learning_rate": 2.877e-06, + "loss": 0.0004, + "num_tokens": 414374.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 25.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.903152465820312, + "kl": 0.02432501211296767, + "learning_rate": 2.876666666666667e-06, + "loss": 0.1839, + "num_tokens": 414645.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0227950811386108, + "kl": 0.09256695955991745, + "learning_rate": 2.8763333333333333e-06, + "loss": 0.005, + "num_tokens": 414925.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 25.425925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.019451141357422, + "kl": 0.08019421622157097, + "learning_rate": 2.876e-06, + "loss": -0.0647, + "num_tokens": 415204.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.009433962404727936, + "clip_ratio/high_mean": 0.009433962404727936, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009433962404727936, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 25.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.732794761657715, + "kl": 0.06344223394989967, + "learning_rate": 2.875666666666667e-06, + "loss": 0.0736, + "num_tokens": 415518.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1445167064666748, + "kl": 0.027428697794675827, + "learning_rate": 2.8753333333333332e-06, + "loss": 0.0017, + "num_tokens": 415827.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 25.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.665595054626465, + "kl": 0.012714105658233166, + "learning_rate": 2.875e-06, + "loss": -0.015, + "num_tokens": 416112.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 25.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.245586395263672, + "kl": 0.011372138047590852, + "learning_rate": 2.8746666666666668e-06, + "loss": 0.08, + "num_tokens": 416438.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0409581661224365, + "kl": 0.0564742386341095, + "learning_rate": 2.8743333333333336e-06, + "loss": -0.0114, + "num_tokens": 416739.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 25.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3304027318954468, + "kl": 0.1141563281416893, + "learning_rate": 2.874e-06, + "loss": 0.006, + "num_tokens": 417066.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 25.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.342610836029053, + "kl": 0.09788231551647186, + "learning_rate": 2.8736666666666667e-06, + "loss": 0.05, + "num_tokens": 417381.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 25.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11588838696479797, + "kl": 0.00980787631124258, + "learning_rate": 2.873333333333333e-06, + "loss": 0.0005, + "num_tokens": 417671.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 25.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022172139957547188, + "kl": 0.0008865445852279663, + "learning_rate": 2.8730000000000003e-06, + "loss": 0.0, + "num_tokens": 417881.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06228451803326607, + "kl": 0.005677401786670089, + "learning_rate": 2.872666666666667e-06, + "loss": 0.0003, + "num_tokens": 418192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01638312265276909, + "kl": 0.0003984219874837436, + "learning_rate": 2.8723333333333334e-06, + "loss": 0.0, + "num_tokens": 418500.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 25.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.236784934997559, + "kl": 0.06469046536949463, + "learning_rate": 2.872e-06, + "loss": 0.1034, + "num_tokens": 418810.0, + "reward": 5.5, + "reward_std": 4.6726155281066895, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 4.6726155281066895, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 25.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045878250151872635, + "kl": 0.00161789043340832, + "learning_rate": 2.8716666666666666e-06, + "loss": 0.0001, + "num_tokens": 419088.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 72.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 72.5, + "completions/mean_terminated_length": 11.333333969116211, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 25.685185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2424540519714355, + "kl": 0.008048801682889462, + "learning_rate": 2.8713333333333333e-06, + "loss": 0.4699, + "num_tokens": 419598.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 25.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9747731685638428, + "kl": 0.0548938550055027, + "learning_rate": 2.871e-06, + "loss": -0.0055, + "num_tokens": 420002.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10652562975883484, + "kl": 0.012743937084451318, + "learning_rate": 2.870666666666667e-06, + "loss": 0.0006, + "num_tokens": 420282.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 25.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012463857419788837, + "kl": 0.0007106041011866182, + "learning_rate": 2.8703333333333333e-06, + "loss": 0.0, + "num_tokens": 420554.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 25.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07210277020931244, + "kl": 0.009725909680128098, + "learning_rate": 2.87e-06, + "loss": 0.0005, + "num_tokens": 420897.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 25.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02645871229469776, + "kl": 0.0010656331141944975, + "learning_rate": 2.869666666666667e-06, + "loss": 0.0001, + "num_tokens": 421132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 25.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018604299053549767, + "kl": 0.09517047926783562, + "learning_rate": 2.869333333333333e-06, + "loss": 0.0048, + "num_tokens": 421496.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39529165625572205, + "kl": 0.025301030604168773, + "learning_rate": 2.869e-06, + "loss": 0.0014, + "num_tokens": 421794.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 25.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017190895974636078, + "kl": 0.0019027739763259888, + "learning_rate": 2.8686666666666668e-06, + "loss": 0.0001, + "num_tokens": 422010.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 25.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06255856901407242, + "kl": 0.008704939857125282, + "learning_rate": 2.8683333333333335e-06, + "loss": 0.0004, + "num_tokens": 422294.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 25.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2053355574607849, + "kl": 0.025588180869817734, + "learning_rate": 2.868e-06, + "loss": 0.0014, + "num_tokens": 422637.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 25.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007869244553148746, + "kl": 0.0044186601880937815, + "learning_rate": 2.8676666666666667e-06, + "loss": 0.0002, + "num_tokens": 422905.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 25.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022038500756025314, + "kl": 0.0031958511099219322, + "learning_rate": 2.867333333333333e-06, + "loss": 0.0002, + "num_tokens": 423217.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 25.925925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2725090980529785, + "kl": 0.057315390557050705, + "learning_rate": 2.8670000000000002e-06, + "loss": 0.0994, + "num_tokens": 423554.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 25.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002391482557868585, + "kl": 0.00010408461093902588, + "learning_rate": 2.866666666666667e-06, + "loss": 0.0, + "num_tokens": 423774.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 25.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4480767250061035, + "kl": 0.02600930631160736, + "learning_rate": 2.8663333333333334e-06, + "loss": 0.0394, + "num_tokens": 424129.0, + "reward": 3.875, + "reward_std": 2.688710927963257, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 2.688710927963257, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 25.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11902312189340591, + "kl": 0.013970667496323586, + "learning_rate": 2.866e-06, + "loss": 0.0007, + "num_tokens": 424431.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 26.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12946157157421112, + "kl": 0.02108490839600563, + "learning_rate": 2.8656666666666665e-06, + "loss": 0.0011, + "num_tokens": 424703.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 26.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.329207420349121, + "kl": 0.08236123993992805, + "learning_rate": 2.8653333333333333e-06, + "loss": -0.1037, + "num_tokens": 425047.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 26.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02120780199766159, + "kl": 0.003914693836122751, + "learning_rate": 2.865e-06, + "loss": 0.0002, + "num_tokens": 425383.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 26.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03202745318412781, + "kl": 0.0058736191131174564, + "learning_rate": 2.864666666666667e-06, + "loss": 0.0003, + "num_tokens": 425689.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 26.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018070092424750328, + "kl": 0.0010024543153122067, + "learning_rate": 2.8643333333333332e-06, + "loss": 0.0001, + "num_tokens": 426010.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 26.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11457064002752304, + "kl": 0.013781537534669042, + "learning_rate": 2.864e-06, + "loss": 0.0007, + "num_tokens": 426340.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 26.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006577032618224621, + "kl": 0.01581774465739727, + "learning_rate": 2.863666666666667e-06, + "loss": 0.0008, + "num_tokens": 426600.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 26.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11055237799882889, + "kl": 0.01578545314259827, + "learning_rate": 2.863333333333333e-06, + "loss": 0.0008, + "num_tokens": 426872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09204570204019547, + "kl": 0.005240729544311762, + "learning_rate": 2.8630000000000004e-06, + "loss": 0.0002, + "num_tokens": 427126.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002377243945375085, + "kl": 0.00010420382022857666, + "learning_rate": 2.8626666666666667e-06, + "loss": 0.0, + "num_tokens": 427346.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 26.185185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6935038566589355, + "kl": 0.05672331899404526, + "learning_rate": 2.8623333333333335e-06, + "loss": -0.2265, + "num_tokens": 427785.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006571251433342695, + "kl": 0.0027382224798202515, + "learning_rate": 2.862e-06, + "loss": 0.0001, + "num_tokens": 428021.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24043260514736176, + "kl": 0.018282222794368863, + "learning_rate": 2.8616666666666667e-06, + "loss": 0.0009, + "num_tokens": 428320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.008064515888690948, + "clip_ratio/high_mean": 0.008064515888690948, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008064515888690948, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 26.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4426515102386475, + "kl": 0.02955270535312593, + "learning_rate": 2.8613333333333334e-06, + "loss": -0.0004, + "num_tokens": 428749.0, + "reward": 2.5999999046325684, + "reward_std": 0.4618801772594452, + "rewards/reward_combined/mean": 2.5999999046325684, + "rewards/reward_combined/std": 0.4618801772594452, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 26.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11867247521877289, + "kl": 0.010655163321644068, + "learning_rate": 2.8610000000000002e-06, + "loss": 0.0007, + "num_tokens": 429097.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 26.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.604410707950592, + "kl": 0.04736524447798729, + "learning_rate": 2.860666666666667e-06, + "loss": 0.0024, + "num_tokens": 429357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16216279566287994, + "kl": 0.0031636282801628113, + "learning_rate": 2.8603333333333334e-06, + "loss": 0.0002, + "num_tokens": 429571.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 26.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09082437306642532, + "kl": 0.01961122266948223, + "learning_rate": 2.86e-06, + "loss": 0.001, + "num_tokens": 429867.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 26.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10625924915075302, + "kl": 0.16769862174987793, + "learning_rate": 2.8596666666666665e-06, + "loss": 0.0084, + "num_tokens": 430179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 26.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638618603348732, + "kl": 0.0008038555170060135, + "learning_rate": 2.8593333333333333e-06, + "loss": 0.0, + "num_tokens": 430401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 26.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11778359115123749, + "kl": 0.019276143983006477, + "learning_rate": 2.859e-06, + "loss": 0.001, + "num_tokens": 430675.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 26.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3700369894504547, + "kl": 0.01605492690578103, + "learning_rate": 2.858666666666667e-06, + "loss": 0.0008, + "num_tokens": 430889.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 26.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021307064220309258, + "kl": 0.0946941003203392, + "learning_rate": 2.8583333333333332e-06, + "loss": 0.0047, + "num_tokens": 431253.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 26.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07501452416181564, + "kl": 0.009454557904973626, + "learning_rate": 2.858e-06, + "loss": 0.0005, + "num_tokens": 431545.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 26.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2175190448760986, + "kl": 0.033329762518405914, + "learning_rate": 2.8576666666666668e-06, + "loss": 0.0431, + "num_tokens": 431836.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 26.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03348877653479576, + "kl": 0.0033172527328133583, + "learning_rate": 2.8573333333333336e-06, + "loss": 0.0002, + "num_tokens": 432148.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 26.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07658609747886658, + "kl": 0.05102025344967842, + "learning_rate": 2.8570000000000003e-06, + "loss": 0.0026, + "num_tokens": 432446.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 26.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03014654852449894, + "kl": 0.004795238608494401, + "learning_rate": 2.8566666666666667e-06, + "loss": 0.0002, + "num_tokens": 432735.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 26.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11690176278352737, + "kl": 0.0044711134396493435, + "learning_rate": 2.8563333333333335e-06, + "loss": 0.0002, + "num_tokens": 432955.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 26.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03666602075099945, + "kl": 0.0011766999959945679, + "learning_rate": 2.856e-06, + "loss": 0.0001, + "num_tokens": 433199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 26.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0306557547301054, + "kl": 0.0009726583957672119, + "learning_rate": 2.8556666666666666e-06, + "loss": 0.0, + "num_tokens": 433455.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 26.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1592872142791748, + "kl": 0.06260843947529793, + "learning_rate": 2.8553333333333334e-06, + "loss": 0.0031, + "num_tokens": 433757.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 26.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9480538964271545, + "kl": 0.061230313032865524, + "learning_rate": 2.855e-06, + "loss": 0.0031, + "num_tokens": 434019.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 26.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.19640588760376, + "kl": 0.023320306092500687, + "learning_rate": 2.854666666666667e-06, + "loss": 0.1597, + "num_tokens": 434301.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 26.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.577958583831787, + "kl": 0.007525532506406307, + "learning_rate": 2.8543333333333333e-06, + "loss": 0.0031, + "num_tokens": 434599.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 26.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.988666534423828, + "kl": 0.03583371452987194, + "learning_rate": 2.854e-06, + "loss": 0.0964, + "num_tokens": 434919.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 26.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.915569305419922, + "kl": 0.030587024171836674, + "learning_rate": 2.8536666666666665e-06, + "loss": 0.0895, + "num_tokens": 435158.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 84.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 84.75, + "completions/mean_terminated_length": 27.666667938232422, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 26.685185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9425950050354004, + "kl": 0.006271434482187033, + "learning_rate": 2.8533333333333333e-06, + "loss": 0.3834, + "num_tokens": 435725.0, + "reward": 5.300000190734863, + "reward_std": 5.399999618530273, + "rewards/reward_combined/mean": 5.300000190734863, + "rewards/reward_combined/std": 5.40000057220459, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.013513513840734959, + "clip_ratio/high_mean": 0.013513513840734959, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 26.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.355051517486572, + "kl": 0.011743251234292984, + "learning_rate": 2.853e-06, + "loss": 0.1324, + "num_tokens": 436030.0, + "reward": 6.25, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 2.5, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 26.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06819178909063339, + "kl": 0.0028445011121220887, + "learning_rate": 2.852666666666667e-06, + "loss": 0.0001, + "num_tokens": 436300.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 26.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034059494733810425, + "kl": 0.002107149106450379, + "learning_rate": 2.8523333333333336e-06, + "loss": 0.0001, + "num_tokens": 436562.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 26.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053593143820762634, + "kl": 0.006958060432225466, + "learning_rate": 2.852e-06, + "loss": 0.0003, + "num_tokens": 436891.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 26.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.85930061340332, + "kl": 0.055007945746183395, + "learning_rate": 2.8516666666666668e-06, + "loss": 0.0497, + "num_tokens": 437187.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 26.796296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4209827184677124, + "kl": 0.006550558842718601, + "learning_rate": 2.8513333333333335e-06, + "loss": -0.0444, + "num_tokens": 437517.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 26.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053395211696624756, + "kl": 0.0027509289793670177, + "learning_rate": 2.8510000000000003e-06, + "loss": 0.0001, + "num_tokens": 437781.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.009259259328246117, + "clip_ratio/high_mean": 0.009259259328246117, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009259259328246117, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 26.833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.038480758666992, + "kl": 0.08235517889261246, + "learning_rate": 2.8506666666666667e-06, + "loss": -0.0701, + "num_tokens": 438111.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 26.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007691516540944576, + "kl": 0.0002653861738508567, + "learning_rate": 2.8503333333333335e-06, + "loss": 0.0, + "num_tokens": 438419.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 26.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.201418399810791, + "kl": 0.0898846909403801, + "learning_rate": 2.85e-06, + "loss": 0.0233, + "num_tokens": 438754.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 26.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031339455395936966, + "kl": 0.0016578052891418338, + "learning_rate": 2.8496666666666666e-06, + "loss": 0.0001, + "num_tokens": 439030.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 26.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18257854878902435, + "kl": 0.05395127087831497, + "learning_rate": 2.8493333333333334e-06, + "loss": 0.0026, + "num_tokens": 439382.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 26.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14778709411621094, + "kl": 0.022392653860151768, + "learning_rate": 2.849e-06, + "loss": 0.0011, + "num_tokens": 439656.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 26.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020430092699825764, + "kl": 0.0010884659131988883, + "learning_rate": 2.848666666666667e-06, + "loss": 0.0001, + "num_tokens": 439936.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 26.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.953999519348145, + "kl": 0.020798705518245697, + "learning_rate": 2.8483333333333333e-06, + "loss": 0.0803, + "num_tokens": 440213.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 26.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.174964427947998, + "kl": 0.008955185767263174, + "learning_rate": 2.848e-06, + "loss": 0.0011, + "num_tokens": 440517.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 27.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023876987397670746, + "kl": 0.004471091320738196, + "learning_rate": 2.8476666666666665e-06, + "loss": 0.0002, + "num_tokens": 440801.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 27.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04079962894320488, + "kl": 0.001997561543248594, + "learning_rate": 2.8473333333333337e-06, + "loss": 0.0001, + "num_tokens": 441108.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3792517185211182, + "kl": 0.12301432993263006, + "learning_rate": 2.847e-06, + "loss": 0.0062, + "num_tokens": 441404.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 27.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037380483001470566, + "kl": 0.0010350601805839688, + "learning_rate": 2.846666666666667e-06, + "loss": 0.0001, + "num_tokens": 441637.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 27.074074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.035538196563721, + "kl": 0.13028479367494583, + "learning_rate": 2.8463333333333336e-06, + "loss": 0.058, + "num_tokens": 441971.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 27.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08378343284130096, + "kl": 0.045418718829751015, + "learning_rate": 2.846e-06, + "loss": 0.0023, + "num_tokens": 442273.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 27.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02732064388692379, + "kl": 0.0015994884306564927, + "learning_rate": 2.8456666666666667e-06, + "loss": 0.0001, + "num_tokens": 442535.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 27.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1321711540222168, + "kl": 0.039184389635920525, + "learning_rate": 2.8453333333333335e-06, + "loss": 0.002, + "num_tokens": 442831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 27.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005658379755914211, + "kl": 0.0014997664839029312, + "learning_rate": 2.8450000000000003e-06, + "loss": 0.0001, + "num_tokens": 443143.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 27.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07467425614595413, + "kl": 0.01969664730131626, + "learning_rate": 2.8446666666666666e-06, + "loss": 0.001, + "num_tokens": 443465.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 27.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08769779652357101, + "kl": 0.01807217695750296, + "learning_rate": 2.8443333333333334e-06, + "loss": 0.0009, + "num_tokens": 443751.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 27.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03231378272175789, + "kl": 0.004902600310742855, + "learning_rate": 2.844e-06, + "loss": 0.0002, + "num_tokens": 444056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 27.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08490697294473648, + "kl": 0.017958096228539944, + "learning_rate": 2.8436666666666666e-06, + "loss": 0.0009, + "num_tokens": 444383.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 27.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018972262740135193, + "kl": 0.09514028578996658, + "learning_rate": 2.8433333333333334e-06, + "loss": 0.0048, + "num_tokens": 444747.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 27.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058594103902578354, + "kl": 0.0005277901946101338, + "learning_rate": 2.843e-06, + "loss": 0.0, + "num_tokens": 444961.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 27.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33628398180007935, + "kl": 0.04409374576061964, + "learning_rate": 2.842666666666667e-06, + "loss": 0.0022, + "num_tokens": 445287.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 27.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14954088628292084, + "kl": 0.006559070199728012, + "learning_rate": 2.8423333333333333e-06, + "loss": 0.0003, + "num_tokens": 445547.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.314814814814813, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.589659214019775, + "kl": 0.05561095476150513, + "learning_rate": 2.842e-06, + "loss": -0.037, + "num_tokens": 445834.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 27.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012504206039011478, + "kl": 0.008804184384644032, + "learning_rate": 2.8416666666666664e-06, + "loss": 0.0004, + "num_tokens": 446106.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09010161459445953, + "kl": 0.003900033188983798, + "learning_rate": 2.8413333333333336e-06, + "loss": 0.0002, + "num_tokens": 446370.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 27.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025419285520911217, + "kl": 0.00010056048631668091, + "learning_rate": 2.841e-06, + "loss": 0.0, + "num_tokens": 446590.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 27.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.483871459960938, + "kl": 0.028468238189816475, + "learning_rate": 2.8406666666666668e-06, + "loss": -0.0025, + "num_tokens": 446862.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 27.40740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8909201622009277, + "kl": 0.06551635637879372, + "learning_rate": 2.8403333333333336e-06, + "loss": 0.0045, + "num_tokens": 447197.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019390955567359924, + "kl": 0.008523456286638975, + "learning_rate": 2.84e-06, + "loss": 0.0004, + "num_tokens": 447465.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 27.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002513324609026313, + "kl": 0.0009862482838798314, + "learning_rate": 2.8396666666666667e-06, + "loss": 0.0, + "num_tokens": 447745.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 27.462962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6288270950317383, + "kl": 0.05403287336230278, + "learning_rate": 2.8393333333333335e-06, + "loss": 0.1173, + "num_tokens": 448094.0, + "reward": 4.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 2.0, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 27.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0399010106921196, + "kl": 0.003508269786834717, + "learning_rate": 2.8390000000000003e-06, + "loss": 0.0002, + "num_tokens": 448306.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 27.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1594657003879547, + "kl": 0.01845983834937215, + "learning_rate": 2.8386666666666666e-06, + "loss": 0.0009, + "num_tokens": 448606.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 27.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020616723224520683, + "kl": 0.005141196423210204, + "learning_rate": 2.8383333333333334e-06, + "loss": 0.0003, + "num_tokens": 448897.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 27.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6328845620155334, + "kl": 0.07485029846429825, + "learning_rate": 2.8379999999999998e-06, + "loss": 0.0037, + "num_tokens": 449157.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 27.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12431022524833679, + "kl": 0.041104961186647415, + "learning_rate": 2.8376666666666665e-06, + "loss": 0.002, + "num_tokens": 449574.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 27.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006374900694936514, + "kl": 0.0027919337153434753, + "learning_rate": 2.8373333333333338e-06, + "loss": 0.0001, + "num_tokens": 449810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 27.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0332266241312027, + "kl": 0.02783461380749941, + "learning_rate": 2.837e-06, + "loss": 0.0014, + "num_tokens": 450164.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 27.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20492404699325562, + "kl": 0.05932088941335678, + "learning_rate": 2.836666666666667e-06, + "loss": 0.0029, + "num_tokens": 450464.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 27.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.562373161315918, + "kl": 0.028178778651636094, + "learning_rate": 2.8363333333333333e-06, + "loss": -0.1328, + "num_tokens": 450682.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 27.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03149878606200218, + "kl": 0.005483974236994982, + "learning_rate": 2.836e-06, + "loss": 0.0003, + "num_tokens": 450971.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 27.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 26.841873168945312, + "kl": 4.637267604470253, + "learning_rate": 2.835666666666667e-06, + "loss": 0.3793, + "num_tokens": 451189.0, + "reward": 2.5, + "reward_std": 3.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.0, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 27.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08196630328893661, + "kl": 0.003600056399591267, + "learning_rate": 2.8353333333333336e-06, + "loss": 0.0002, + "num_tokens": 451437.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 27.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5261520147323608, + "kl": 0.1114635318517685, + "learning_rate": 2.835e-06, + "loss": 0.0056, + "num_tokens": 451731.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04770840331912041, + "kl": 0.005972094601020217, + "learning_rate": 2.8346666666666667e-06, + "loss": 0.0003, + "num_tokens": 452013.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 27.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1798420548439026, + "kl": 0.01135201659053564, + "learning_rate": 2.8343333333333335e-06, + "loss": 0.0005, + "num_tokens": 452256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 27.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.939312934875488, + "kl": 0.019981331191956997, + "learning_rate": 2.834e-06, + "loss": 0.0506, + "num_tokens": 452602.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 27.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25883594155311584, + "kl": 0.01398910884745419, + "learning_rate": 2.8336666666666667e-06, + "loss": 0.0007, + "num_tokens": 452858.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 27.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043560244143009186, + "kl": 0.004490455146878958, + "learning_rate": 2.8333333333333335e-06, + "loss": 0.0002, + "num_tokens": 453192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 27.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01911994256079197, + "kl": 0.000843496760353446, + "learning_rate": 2.8330000000000002e-06, + "loss": 0.0, + "num_tokens": 453509.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.014285714365541935, + "clip_ratio/high_mean": 0.014285714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014285714365541935, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 27.833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.820046901702881, + "kl": 0.11318179219961166, + "learning_rate": 2.8326666666666666e-06, + "loss": -0.162, + "num_tokens": 453816.0, + "reward": 2.375, + "reward_std": 1.8874585628509521, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.8874585628509521, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07642263919115067, + "kl": 0.010808147490024567, + "learning_rate": 2.8323333333333334e-06, + "loss": 0.0006, + "num_tokens": 454088.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 27.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006149383261799812, + "kl": 0.015830593183636665, + "learning_rate": 2.8319999999999997e-06, + "loss": 0.0008, + "num_tokens": 454348.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 27.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.720034599304199, + "kl": 0.07616878300905228, + "learning_rate": 2.831666666666667e-06, + "loss": 0.0938, + "num_tokens": 454684.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 27.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05332932248711586, + "kl": 0.1629938930273056, + "learning_rate": 2.8313333333333337e-06, + "loss": 0.0082, + "num_tokens": 454993.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 27.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04099570959806442, + "kl": 0.0013543331297114491, + "learning_rate": 2.831e-06, + "loss": 0.0001, + "num_tokens": 455269.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 27.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038438763469457626, + "kl": 0.0010230416955891997, + "learning_rate": 2.830666666666667e-06, + "loss": 0.0, + "num_tokens": 455539.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 27.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0880391076207161, + "kl": 0.002278156578540802, + "learning_rate": 2.8303333333333332e-06, + "loss": 0.0001, + "num_tokens": 455747.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 27.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.167452812194824, + "kl": 0.006329163908958435, + "learning_rate": 2.83e-06, + "loss": -0.0231, + "num_tokens": 456022.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 28.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04484656825661659, + "kl": 0.0024207322858273983, + "learning_rate": 2.829666666666667e-06, + "loss": 0.0001, + "num_tokens": 456333.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 28.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012770017609000206, + "kl": 0.0002797568013193086, + "learning_rate": 2.8293333333333336e-06, + "loss": 0.0, + "num_tokens": 456640.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 0.0, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 28.037037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8218302726745605, + "kl": 0.0563189834356308, + "learning_rate": 2.829e-06, + "loss": 0.0101, + "num_tokens": 456976.0, + "reward": 2.0, + "reward_std": 2.345207929611206, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 2.345207929611206, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 28.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07622843235731125, + "kl": 0.009533829987049103, + "learning_rate": 2.8286666666666667e-06, + "loss": 0.0005, + "num_tokens": 457269.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 28.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024697667686268687, + "kl": 0.00010000169277191162, + "learning_rate": 2.8283333333333335e-06, + "loss": 0.0, + "num_tokens": 457489.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 28.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059532761573791504, + "kl": 0.0007642433047294617, + "learning_rate": 2.828e-06, + "loss": 0.0, + "num_tokens": 457697.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 28.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14826706051826477, + "kl": 0.016935094725340605, + "learning_rate": 2.8276666666666666e-06, + "loss": 0.0009, + "num_tokens": 458030.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 28.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.900506973266602, + "kl": 0.09935533255338669, + "learning_rate": 2.8273333333333334e-06, + "loss": 0.0443, + "num_tokens": 458398.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 28.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054693274199962616, + "kl": 0.049269139766693115, + "learning_rate": 2.827e-06, + "loss": 0.0025, + "num_tokens": 458726.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.166666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.160628318786621, + "kl": 0.017831362085416913, + "learning_rate": 2.8266666666666666e-06, + "loss": 0.15, + "num_tokens": 458995.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 28.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011075237765908241, + "kl": 0.00912595959380269, + "learning_rate": 2.8263333333333333e-06, + "loss": 0.0005, + "num_tokens": 459267.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 28.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016902657225728035, + "kl": 0.0006451904773712158, + "learning_rate": 2.8259999999999997e-06, + "loss": 0.0, + "num_tokens": 459527.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.01515151560306549, + "clip_ratio/high_mean": 0.01515151560306549, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01515151560306549, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7644505500793457, + "kl": 0.0067447873298078775, + "learning_rate": 2.825666666666667e-06, + "loss": 0.0134, + "num_tokens": 459843.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 28.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022648751735687256, + "kl": 0.0008592535159550607, + "learning_rate": 2.8253333333333337e-06, + "loss": 0.0, + "num_tokens": 460160.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 28.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.156835079193115, + "kl": 0.0632067397236824, + "learning_rate": 2.825e-06, + "loss": 0.0063, + "num_tokens": 460475.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04755140841007233, + "kl": 0.003388937722775154, + "learning_rate": 2.824666666666667e-06, + "loss": 0.0002, + "num_tokens": 460745.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 28.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026772283017635345, + "kl": 0.008730332367122173, + "learning_rate": 2.824333333333333e-06, + "loss": 0.0004, + "num_tokens": 461046.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 28.314814814814813, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9703738689422607, + "kl": 0.08144580945372581, + "learning_rate": 2.824e-06, + "loss": 0.0628, + "num_tokens": 461403.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 28.333333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.690491199493408, + "kl": 0.04787398502230644, + "learning_rate": 2.8236666666666668e-06, + "loss": 0.0311, + "num_tokens": 461725.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 28.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009542643092572689, + "kl": 0.0005669295787811279, + "learning_rate": 2.8233333333333335e-06, + "loss": 0.0, + "num_tokens": 461941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3834571838378906, + "kl": 0.009222902008332312, + "learning_rate": 2.823e-06, + "loss": -0.0002, + "num_tokens": 462237.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 28.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005147993564605713, + "kl": 0.01599263586103916, + "learning_rate": 2.8226666666666667e-06, + "loss": 0.0008, + "num_tokens": 462497.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 28.40740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.021935939788818, + "kl": 0.004281937421183102, + "learning_rate": 2.8223333333333335e-06, + "loss": 0.0933, + "num_tokens": 462719.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 28.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060498569160699844, + "kl": 0.02687366772443056, + "learning_rate": 2.822e-06, + "loss": 0.0014, + "num_tokens": 463078.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05655942112207413, + "kl": 0.0014583574957214296, + "learning_rate": 2.821666666666667e-06, + "loss": 0.0001, + "num_tokens": 463350.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 28.462962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3880438804626465, + "kl": 0.10363345220685005, + "learning_rate": 2.8213333333333334e-06, + "loss": 0.033, + "num_tokens": 463655.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 28.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07333117723464966, + "kl": 0.01207177247852087, + "learning_rate": 2.821e-06, + "loss": 0.0006, + "num_tokens": 463955.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 28.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049097783863544464, + "kl": 0.16059517115354538, + "learning_rate": 2.8206666666666665e-06, + "loss": 0.008, + "num_tokens": 464265.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 28.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023787448182702065, + "kl": 0.0008872672915458679, + "learning_rate": 2.8203333333333333e-06, + "loss": 0.0, + "num_tokens": 464477.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 28.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007525818422436714, + "kl": 0.0009126712975557894, + "learning_rate": 2.82e-06, + "loss": 0.0, + "num_tokens": 464693.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08046360313892365, + "kl": 0.01114275585860014, + "learning_rate": 2.819666666666667e-06, + "loss": 0.0006, + "num_tokens": 464967.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 28.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0992235317826271, + "kl": 0.02111656591296196, + "learning_rate": 2.8193333333333337e-06, + "loss": 0.0011, + "num_tokens": 465261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 28.59259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.259025812149048, + "kl": 0.05788859911262989, + "learning_rate": 2.819e-06, + "loss": 0.3244, + "num_tokens": 465622.0, + "reward": 3.5, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 4.690415859222412, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06180359795689583, + "kl": 0.004265672294422984, + "learning_rate": 2.818666666666667e-06, + "loss": 0.0002, + "num_tokens": 465901.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 28.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0129468441009521, + "kl": 0.064463309943676, + "learning_rate": 2.818333333333333e-06, + "loss": 0.0032, + "num_tokens": 466137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03907158598303795, + "kl": 0.007112330291420221, + "learning_rate": 2.818e-06, + "loss": 0.0004, + "num_tokens": 466407.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 28.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027319842483848333, + "kl": 0.0003667546552605927, + "learning_rate": 2.8176666666666667e-06, + "loss": 0.0, + "num_tokens": 466721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0143232187256217, + "kl": 0.0007744103495497257, + "learning_rate": 2.8173333333333335e-06, + "loss": 0.0, + "num_tokens": 466977.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 28.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052229199558496475, + "kl": 0.011092833708971739, + "learning_rate": 2.817e-06, + "loss": 0.0006, + "num_tokens": 467304.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02033820189535618, + "kl": 0.006365820998325944, + "learning_rate": 2.8166666666666667e-06, + "loss": 0.0003, + "num_tokens": 467572.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019550353288650513, + "kl": 0.001157495629740879, + "learning_rate": 2.8163333333333334e-06, + "loss": 0.0001, + "num_tokens": 467834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 28.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.208601951599121, + "kl": 0.05314292386174202, + "learning_rate": 2.8160000000000002e-06, + "loss": 0.0846, + "num_tokens": 468123.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 28.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7484064698219299, + "kl": 0.13676604256033897, + "learning_rate": 2.815666666666667e-06, + "loss": 0.0068, + "num_tokens": 468419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.796296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.701013565063477, + "kl": 0.019662877544760704, + "learning_rate": 2.8153333333333334e-06, + "loss": 0.2003, + "num_tokens": 468692.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 28.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0669856145977974, + "kl": 0.010359282605350018, + "learning_rate": 2.815e-06, + "loss": 0.0005, + "num_tokens": 469014.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 28.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02854492887854576, + "kl": 0.0026429988211020827, + "learning_rate": 2.8146666666666665e-06, + "loss": 0.0001, + "num_tokens": 469302.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 28.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12357215583324432, + "kl": 0.0030721084913238883, + "learning_rate": 2.8143333333333333e-06, + "loss": 0.0001, + "num_tokens": 469536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 28.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7089741826057434, + "kl": 0.06357227265834808, + "learning_rate": 2.814e-06, + "loss": 0.0032, + "num_tokens": 469814.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 28.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02073819376528263, + "kl": 0.09475615248084068, + "learning_rate": 2.813666666666667e-06, + "loss": 0.0047, + "num_tokens": 470178.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 28.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18267345428466797, + "kl": 0.011012335307896137, + "learning_rate": 2.8133333333333336e-06, + "loss": 0.0005, + "num_tokens": 470421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 28.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13044600188732147, + "kl": 0.010010089725255966, + "learning_rate": 2.813e-06, + "loss": 0.0005, + "num_tokens": 470688.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 28.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15959681570529938, + "kl": 0.010754904244095087, + "learning_rate": 2.8126666666666668e-06, + "loss": 0.0005, + "num_tokens": 470968.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 28.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010415318422019482, + "kl": 0.0018611857667565346, + "learning_rate": 2.812333333333333e-06, + "loss": 0.0001, + "num_tokens": 471280.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 28.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20058171451091766, + "kl": 0.0511362012475729, + "learning_rate": 2.812e-06, + "loss": 0.0026, + "num_tokens": 471585.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 29.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04505414888262749, + "kl": 0.05116620287299156, + "learning_rate": 2.8116666666666667e-06, + "loss": 0.0026, + "num_tokens": 471990.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 29.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008563397452235222, + "kl": 0.0004373746196506545, + "learning_rate": 2.8113333333333335e-06, + "loss": 0.0, + "num_tokens": 472225.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 29.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038594141602516174, + "kl": 0.004309385549277067, + "learning_rate": 2.8110000000000003e-06, + "loss": 0.0002, + "num_tokens": 472549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 29.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12808479368686676, + "kl": 0.009709966834634542, + "learning_rate": 2.8106666666666666e-06, + "loss": 0.0005, + "num_tokens": 472811.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 29.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09926086664199829, + "kl": 0.03737693093717098, + "learning_rate": 2.8103333333333334e-06, + "loss": 0.0019, + "num_tokens": 473161.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 29.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2803255319595337, + "kl": 0.167035561054945, + "learning_rate": 2.81e-06, + "loss": 0.0075, + "num_tokens": 473483.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 29.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.206502676010132, + "kl": 0.0034317674580961466, + "learning_rate": 2.809666666666667e-06, + "loss": 0.0138, + "num_tokens": 473789.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 29.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08774306625127792, + "kl": 0.06965498067438602, + "learning_rate": 2.8093333333333333e-06, + "loss": 0.0034, + "num_tokens": 474200.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 29.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09498246014118195, + "kl": 0.010398188140243292, + "learning_rate": 2.809e-06, + "loss": 0.0005, + "num_tokens": 474502.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 29.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09661281853914261, + "kl": 0.013702782180189388, + "learning_rate": 2.8086666666666665e-06, + "loss": 0.0007, + "num_tokens": 474775.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 29.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022496992722153664, + "kl": 0.00807579094544053, + "learning_rate": 2.8083333333333333e-06, + "loss": 0.0004, + "num_tokens": 475079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 29.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014664665795862675, + "kl": 0.0002637431025505066, + "learning_rate": 2.808e-06, + "loss": 0.0, + "num_tokens": 475291.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 29.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.122772693634033, + "kl": 0.2681492482079193, + "learning_rate": 2.807666666666667e-06, + "loss": 0.0488, + "num_tokens": 475573.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 29.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022311396896839142, + "kl": 0.0036019354593008757, + "learning_rate": 2.8073333333333336e-06, + "loss": 0.0002, + "num_tokens": 475833.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 29.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6579437255859375, + "kl": 0.0701053871307522, + "learning_rate": 2.807e-06, + "loss": 0.0038, + "num_tokens": 476120.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 29.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22422263026237488, + "kl": 0.075319929048419, + "learning_rate": 2.8066666666666668e-06, + "loss": 0.0038, + "num_tokens": 476433.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 29.296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004530862905085087, + "kl": 0.00031580403447151184, + "learning_rate": 2.806333333333333e-06, + "loss": 0.0, + "num_tokens": 476677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 29.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03519684821367264, + "kl": 0.012216292787343264, + "learning_rate": 2.8060000000000003e-06, + "loss": 0.0006, + "num_tokens": 476961.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 29.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007863408885896206, + "kl": 0.0005006988649256527, + "learning_rate": 2.8056666666666667e-06, + "loss": 0.0, + "num_tokens": 477280.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04942427575588226, + "kl": 0.15612779557704926, + "learning_rate": 2.8053333333333335e-06, + "loss": 0.0078, + "num_tokens": 477591.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13410508632659912, + "kl": 0.027443756349384785, + "learning_rate": 2.8050000000000002e-06, + "loss": 0.0014, + "num_tokens": 477889.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 29.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.087430953979492, + "kl": 0.05423801206052303, + "learning_rate": 2.8046666666666666e-06, + "loss": -0.0041, + "num_tokens": 478240.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 29.40740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.397153854370117, + "kl": 0.04278162680566311, + "learning_rate": 2.8043333333333334e-06, + "loss": 0.0306, + "num_tokens": 478546.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 29.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0318794883787632, + "kl": 0.004159346804954112, + "learning_rate": 2.804e-06, + "loss": 0.0002, + "num_tokens": 478837.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 29.444444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.154887676239014, + "kl": 0.3436839394271374, + "learning_rate": 2.803666666666667e-06, + "loss": 0.0825, + "num_tokens": 479190.0, + "reward": 6.625, + "reward_std": 2.0966243743896484, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.0966243743896484, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 29.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07447133213281631, + "kl": 0.0019177318026777357, + "learning_rate": 2.8033333333333333e-06, + "loss": 0.0001, + "num_tokens": 479446.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 29.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01250352244824171, + "kl": 0.008539030328392982, + "learning_rate": 2.803e-06, + "loss": 0.0004, + "num_tokens": 479718.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 29.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.671222686767578, + "kl": 0.03675028495490551, + "learning_rate": 2.8026666666666665e-06, + "loss": 0.1471, + "num_tokens": 479986.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 29.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10601507127285004, + "kl": 0.002778945490717888, + "learning_rate": 2.8023333333333332e-06, + "loss": 0.0001, + "num_tokens": 480213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2422441691160202, + "kl": 0.06250947341322899, + "learning_rate": 2.802e-06, + "loss": 0.0034, + "num_tokens": 480514.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 29.555555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00024913743254728615, + "kl": 9.768456220626831e-05, + "learning_rate": 2.801666666666667e-06, + "loss": 0.0, + "num_tokens": 480734.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.574074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.347248077392578, + "kl": 0.13809160143136978, + "learning_rate": 2.8013333333333336e-06, + "loss": 0.054, + "num_tokens": 481062.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 29.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08372911810874939, + "kl": 0.007937636459246278, + "learning_rate": 2.801e-06, + "loss": 0.0004, + "num_tokens": 481392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 29.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006388956680893898, + "kl": 0.015753159299492836, + "learning_rate": 2.8006666666666667e-06, + "loss": 0.0008, + "num_tokens": 481652.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 29.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004103833809494972, + "kl": 0.00015307665307773277, + "learning_rate": 2.800333333333333e-06, + "loss": 0.0, + "num_tokens": 481872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 29.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012417380698025227, + "kl": 0.0006692036986351013, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0, + "num_tokens": 482132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10515747219324112, + "kl": 0.016220704652369022, + "learning_rate": 2.7996666666666667e-06, + "loss": 0.0009, + "num_tokens": 482456.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 29.685185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.032846450805664, + "kl": 0.051607828587293625, + "learning_rate": 2.7993333333333334e-06, + "loss": -0.0995, + "num_tokens": 482734.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 29.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009661445394158363, + "kl": 0.0017271991819143295, + "learning_rate": 2.7990000000000002e-06, + "loss": 0.0001, + "num_tokens": 483046.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 29.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2698768079280853, + "kl": 0.08649946004152298, + "learning_rate": 2.7986666666666666e-06, + "loss": 0.0041, + "num_tokens": 483403.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 29.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06218300014734268, + "kl": 0.006660278420895338, + "learning_rate": 2.7983333333333334e-06, + "loss": 0.0003, + "num_tokens": 483675.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 29.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028913540299981833, + "kl": 0.0006885349866934121, + "learning_rate": 2.798e-06, + "loss": 0.0, + "num_tokens": 483935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 29.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5006610155105591, + "kl": 0.06085568247362971, + "learning_rate": 2.797666666666667e-06, + "loss": 0.003, + "num_tokens": 484226.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 29.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006548902485519648, + "kl": 0.0039421889232471585, + "learning_rate": 2.7973333333333333e-06, + "loss": 0.0002, + "num_tokens": 484494.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 29.814814814814813, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.824710845947266, + "kl": 0.010844754579011351, + "learning_rate": 2.797e-06, + "loss": 0.253, + "num_tokens": 484802.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 29.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041225120425224304, + "kl": 0.0008110776543617249, + "learning_rate": 2.7966666666666664e-06, + "loss": 0.0001, + "num_tokens": 485010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 29.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04550207778811455, + "kl": 0.006921528605744243, + "learning_rate": 2.7963333333333332e-06, + "loss": 0.0004, + "num_tokens": 485346.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 29.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09664245694875717, + "kl": 0.02657230943441391, + "learning_rate": 2.7960000000000004e-06, + "loss": 0.0013, + "num_tokens": 485635.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 29.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.3565778732299805, + "kl": 0.6150377094745636, + "learning_rate": 2.7956666666666668e-06, + "loss": 0.0308, + "num_tokens": 485935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 29.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.4122538566589355, + "kl": 0.16321531683206558, + "learning_rate": 2.7953333333333336e-06, + "loss": -0.0418, + "num_tokens": 486209.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 29.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008500074036419392, + "kl": 0.002206355333328247, + "learning_rate": 2.795e-06, + "loss": 0.0001, + "num_tokens": 486445.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 29.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024119025096297264, + "kl": 0.09382932633161545, + "learning_rate": 2.7946666666666667e-06, + "loss": 0.0047, + "num_tokens": 486809.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 29.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712360367178917, + "kl": 0.004835872328840196, + "learning_rate": 2.7943333333333335e-06, + "loss": 0.0002, + "num_tokens": 487127.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 29.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08054126054048538, + "kl": 0.002262810943648219, + "learning_rate": 2.7940000000000003e-06, + "loss": 0.0002, + "num_tokens": 487343.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16639643907546997, + "kl": 0.00734925945289433, + "learning_rate": 2.7936666666666666e-06, + "loss": 0.0004, + "num_tokens": 487639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 30.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05367019400000572, + "kl": 0.04935498721897602, + "learning_rate": 2.7933333333333334e-06, + "loss": 0.0025, + "num_tokens": 487967.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0482790581882, + "kl": 0.005178533028811216, + "learning_rate": 2.793e-06, + "loss": 0.0002, + "num_tokens": 488241.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 30.055555555555557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2608712613582611, + "kl": 0.014890595804899931, + "learning_rate": 2.7926666666666666e-06, + "loss": 0.0007, + "num_tokens": 488513.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 30.074074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06302923709154129, + "kl": 0.03252813499420881, + "learning_rate": 2.7923333333333333e-06, + "loss": 0.0017, + "num_tokens": 488805.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 30.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037170834839344025, + "kl": 0.0008052513003349304, + "learning_rate": 2.792e-06, + "loss": 0.0, + "num_tokens": 489015.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 30.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02708226442337036, + "kl": 0.09318660199642181, + "learning_rate": 2.791666666666667e-06, + "loss": 0.0047, + "num_tokens": 489379.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 30.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1872541755437851, + "kl": 0.07842371612787247, + "learning_rate": 2.7913333333333333e-06, + "loss": 0.0034, + "num_tokens": 489709.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04665502905845642, + "kl": 0.0037679600063711405, + "learning_rate": 2.791e-06, + "loss": 0.0002, + "num_tokens": 489991.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 30.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002486102166585624, + "kl": 9.655207395553589e-05, + "learning_rate": 2.7906666666666664e-06, + "loss": 0.0, + "num_tokens": 490211.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 30.185185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04373552277684212, + "kl": 0.00536091229878366, + "learning_rate": 2.7903333333333336e-06, + "loss": 0.0003, + "num_tokens": 490501.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 30.203703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.721534729003906, + "kl": 0.02560626238118857, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.1521, + "num_tokens": 490793.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 30.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06082050874829292, + "kl": 0.00584647711366415, + "learning_rate": 2.7896666666666668e-06, + "loss": 0.0003, + "num_tokens": 491089.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 30.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03735148906707764, + "kl": 0.0037579393247142434, + "learning_rate": 2.7893333333333335e-06, + "loss": 0.0002, + "num_tokens": 491359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 30.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04815084859728813, + "kl": 0.0004974707844667137, + "learning_rate": 2.789e-06, + "loss": 0.0, + "num_tokens": 491572.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07238934934139252, + "kl": 0.002373345196247101, + "learning_rate": 2.7886666666666667e-06, + "loss": 0.0001, + "num_tokens": 491832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 30.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.438929557800293, + "kl": 0.008408480149228126, + "learning_rate": 2.7883333333333335e-06, + "loss": 0.2166, + "num_tokens": 492059.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005664670839905739, + "kl": 0.004119423218071461, + "learning_rate": 2.7880000000000002e-06, + "loss": 0.0002, + "num_tokens": 492327.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011643906123936176, + "kl": 0.008878742344677448, + "learning_rate": 2.7876666666666666e-06, + "loss": 0.0004, + "num_tokens": 492599.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 30.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22290551662445068, + "kl": 0.05015936307609081, + "learning_rate": 2.7873333333333334e-06, + "loss": 0.0025, + "num_tokens": 492903.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00424150051549077, + "kl": 0.0003255121409893036, + "learning_rate": 2.787e-06, + "loss": 0.0, + "num_tokens": 493147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 30.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18955810368061066, + "kl": 0.13256411626935005, + "learning_rate": 2.7866666666666665e-06, + "loss": 0.0065, + "num_tokens": 493458.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 30.40740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008014434017241001, + "kl": 0.0022009164094924927, + "learning_rate": 2.7863333333333333e-06, + "loss": 0.0001, + "num_tokens": 493694.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05507928878068924, + "kl": 0.00199961184989661, + "learning_rate": 2.786e-06, + "loss": 0.0001, + "num_tokens": 493960.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 30.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019806496798992157, + "kl": 0.0007509946881327778, + "learning_rate": 2.785666666666667e-06, + "loss": 0.0, + "num_tokens": 494216.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 30.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04594004154205322, + "kl": 0.006491444306448102, + "learning_rate": 2.7853333333333332e-06, + "loss": 0.0003, + "num_tokens": 494546.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.226511150598526, + "kl": 0.03469456639140844, + "learning_rate": 2.785e-06, + "loss": 0.0019, + "num_tokens": 494838.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04777868837118149, + "kl": 0.013245042180642486, + "learning_rate": 2.7846666666666664e-06, + "loss": 0.0007, + "num_tokens": 495110.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 30.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025857778266072273, + "kl": 0.0008294135332107544, + "learning_rate": 2.7843333333333336e-06, + "loss": 0.0, + "num_tokens": 495322.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 30.537037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.455300331115723, + "kl": 0.02141680009663105, + "learning_rate": 2.7840000000000004e-06, + "loss": 0.1231, + "num_tokens": 495559.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.010204081423580647, + "clip_ratio/high_mean": 0.010204081423580647, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010204081423580647, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 30.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.250657320022583, + "kl": 0.0780343022197485, + "learning_rate": 2.7836666666666667e-06, + "loss": -0.0085, + "num_tokens": 495872.0, + "reward": 1.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.5, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 30.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1672305315732956, + "kl": 0.056687891483306885, + "learning_rate": 2.7833333333333335e-06, + "loss": 0.0028, + "num_tokens": 496209.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 30.59259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029121653642505407, + "kl": 0.0002818405773723498, + "learning_rate": 2.783e-06, + "loss": 0.0, + "num_tokens": 496521.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 30.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11224111914634705, + "kl": 0.014087349642068148, + "learning_rate": 2.7826666666666666e-06, + "loss": 0.0006, + "num_tokens": 496844.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 30.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0967931747436523, + "kl": 0.1443721354007721, + "learning_rate": 2.7823333333333334e-06, + "loss": -0.0254, + "num_tokens": 497188.0, + "reward": 3.125, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 0.25, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 30.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3747810423374176, + "kl": 0.042956399731338024, + "learning_rate": 2.7820000000000002e-06, + "loss": 0.002, + "num_tokens": 497477.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 30.666666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012312564067542553, + "kl": 0.0035613150103017688, + "learning_rate": 2.7816666666666666e-06, + "loss": 0.0002, + "num_tokens": 497737.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05929622799158096, + "kl": 0.0042467673774808645, + "learning_rate": 2.7813333333333334e-06, + "loss": 0.0002, + "num_tokens": 498039.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 30.703703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02958487533032894, + "kl": 0.0027180557372048497, + "learning_rate": 2.781e-06, + "loss": 0.0001, + "num_tokens": 498335.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 30.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03093082830309868, + "kl": 0.002050258044619113, + "learning_rate": 2.7806666666666665e-06, + "loss": 0.0001, + "num_tokens": 498597.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.855483055114746, + "kl": 0.0427514873445034, + "learning_rate": 2.7803333333333337e-06, + "loss": 0.0471, + "num_tokens": 498872.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 30.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009033478796482086, + "kl": 0.015258362051099539, + "learning_rate": 2.78e-06, + "loss": 0.0008, + "num_tokens": 499132.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 30.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0366926193237305, + "kl": 0.04934370703995228, + "learning_rate": 2.779666666666667e-06, + "loss": -0.0764, + "num_tokens": 499497.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.796296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5685930252075195, + "kl": 0.05688664689660072, + "learning_rate": 2.779333333333333e-06, + "loss": 0.0684, + "num_tokens": 499785.0, + "reward": 6.125, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 30.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03696705400943756, + "kl": 0.003261456935433671, + "learning_rate": 2.779e-06, + "loss": 0.0002, + "num_tokens": 500108.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 30.833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.941387176513672, + "kl": 0.018697240389883518, + "learning_rate": 2.7786666666666668e-06, + "loss": 0.0155, + "num_tokens": 500414.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 30.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030107568949460983, + "kl": 0.0052183972438797355, + "learning_rate": 2.7783333333333336e-06, + "loss": 0.0003, + "num_tokens": 500762.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 30.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.573408126831055, + "kl": 0.013701325049623847, + "learning_rate": 2.7780000000000003e-06, + "loss": 0.0426, + "num_tokens": 501097.0, + "reward": 5.550000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 5.550000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 30.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07573556900024414, + "kl": 0.010404855944216251, + "learning_rate": 2.7776666666666667e-06, + "loss": 0.0005, + "num_tokens": 501397.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011363636702299118, + "clip_ratio/low_min": 0.011363636702299118, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 30.90740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.8701605796813965, + "kl": 0.01183168776333332, + "learning_rate": 2.7773333333333335e-06, + "loss": 0.0591, + "num_tokens": 501735.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 30.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13676707446575165, + "kl": 0.047327421605587006, + "learning_rate": 2.777e-06, + "loss": 0.0024, + "num_tokens": 502027.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 30.944444444444443, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.428872585296631, + "kl": 0.049864813685417175, + "learning_rate": 2.7766666666666666e-06, + "loss": 0.1032, + "num_tokens": 502448.0, + "reward": 2.125, + "reward_std": 1.4361406564712524, + "rewards/reward_combined/mean": 2.125, + "rewards/reward_combined/std": 1.4361406564712524, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 30.962962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014067181386053562, + "kl": 0.0022719979751855135, + "learning_rate": 2.7763333333333334e-06, + "loss": 0.0001, + "num_tokens": 502762.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 30.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011467641219496727, + "kl": 0.0005548670887947083, + "learning_rate": 2.776e-06, + "loss": 0.0, + "num_tokens": 502978.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17998981475830078, + "kl": 0.030927646905183792, + "learning_rate": 2.7756666666666665e-06, + "loss": 0.0017, + "num_tokens": 503245.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 31.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40665602684021, + "kl": 0.07047554478049278, + "learning_rate": 2.7753333333333333e-06, + "loss": 0.0606, + "num_tokens": 503612.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 31.037037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04566563665866852, + "kl": 0.07811117172241211, + "learning_rate": 2.775e-06, + "loss": 0.0039, + "num_tokens": 503975.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 31.055555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.729077100753784, + "kl": 0.056703547947108746, + "learning_rate": 2.774666666666667e-06, + "loss": 0.0821, + "num_tokens": 504287.0, + "reward": 2.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.5, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 31.074074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4384732246398926, + "kl": 0.005358624504879117, + "learning_rate": 2.7743333333333337e-06, + "loss": -0.021, + "num_tokens": 504630.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 31.09259259259259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01795472949743271, + "kl": 0.0021169992396607995, + "learning_rate": 2.774e-06, + "loss": 0.0001, + "num_tokens": 504920.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 31.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0991416797041893, + "kl": 0.003429839853197336, + "learning_rate": 2.773666666666667e-06, + "loss": 0.0002, + "num_tokens": 505140.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 31.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.946234703063965, + "kl": 0.042977893725037575, + "learning_rate": 2.773333333333333e-06, + "loss": 0.0667, + "num_tokens": 505470.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 31.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06238122656941414, + "kl": 0.006720427889376879, + "learning_rate": 2.773e-06, + "loss": 0.0003, + "num_tokens": 505763.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 31.166666666666668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003167220565956086, + "kl": 8.293986320495605e-05, + "learning_rate": 2.7726666666666667e-06, + "loss": 0.0, + "num_tokens": 505983.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 31.185185185185187, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.205992698669434, + "kl": 0.0023598200641572475, + "learning_rate": 2.7723333333333335e-06, + "loss": 0.3666, + "num_tokens": 506221.0, + "reward": 2.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 2.625, + "rewards/reward_combined/std": 2.75, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 31.203703703703702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012067733332514763, + "kl": 0.0007444173097610474, + "learning_rate": 2.7720000000000003e-06, + "loss": 0.0, + "num_tokens": 506481.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 31.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11107742041349411, + "kl": 0.03276311792433262, + "learning_rate": 2.7716666666666667e-06, + "loss": 0.0017, + "num_tokens": 506785.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04169462248682976, + "kl": 0.003586039907531813, + "learning_rate": 2.7713333333333335e-06, + "loss": 0.0002, + "num_tokens": 507053.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 31.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.579318523406982, + "kl": 0.08367524109780788, + "learning_rate": 2.771e-06, + "loss": 0.059, + "num_tokens": 507332.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 31.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010694318450987339, + "kl": 0.01508731348440051, + "learning_rate": 2.7706666666666666e-06, + "loss": 0.0008, + "num_tokens": 507592.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 31.296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.426229476928711, + "kl": 0.0468965582549572, + "learning_rate": 2.7703333333333334e-06, + "loss": -0.0879, + "num_tokens": 508014.0, + "reward": 1.5499999523162842, + "reward_std": 1.2556538581848145, + "rewards/reward_combined/mean": 1.5499999523162842, + "rewards/reward_combined/std": 1.2556538581848145, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.314814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08689247071743011, + "kl": 0.023760899901390076, + "learning_rate": 2.77e-06, + "loss": 0.0012, + "num_tokens": 508280.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 31.333333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3459792733192444, + "kl": 0.03823373280465603, + "learning_rate": 2.769666666666667e-06, + "loss": 0.0023, + "num_tokens": 508560.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 31.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02969568409025669, + "kl": 0.006482162047177553, + "learning_rate": 2.7693333333333333e-06, + "loss": 0.0003, + "num_tokens": 508828.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 31.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28915145993232727, + "kl": 0.04711037874221802, + "learning_rate": 2.769e-06, + "loss": 0.0026, + "num_tokens": 509057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 31.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005393181461840868, + "kl": 0.002824738621711731, + "learning_rate": 2.768666666666667e-06, + "loss": 0.0001, + "num_tokens": 509293.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 31.40740740740741, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3077964782714844, + "kl": 0.089900903403759, + "learning_rate": 2.7683333333333337e-06, + "loss": -0.2119, + "num_tokens": 509636.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 31.425925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014041605405509472, + "kl": 0.0006363093852996826, + "learning_rate": 2.768e-06, + "loss": 0.0, + "num_tokens": 509844.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 31.444444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07646717131137848, + "kl": 0.014523346908390522, + "learning_rate": 2.767666666666667e-06, + "loss": 0.0007, + "num_tokens": 510176.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 31.462962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13709478080272675, + "kl": 0.04975891299545765, + "learning_rate": 2.767333333333333e-06, + "loss": 0.0025, + "num_tokens": 510513.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 31.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1815960556268692, + "kl": 0.014224665239453316, + "learning_rate": 2.767e-06, + "loss": 0.0007, + "num_tokens": 510824.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 31.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.163992777466774, + "kl": 0.012088237330317497, + "learning_rate": 2.7666666666666667e-06, + "loss": 0.0006, + "num_tokens": 511092.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.01515151560306549, + "clip_ratio/high_mean": 0.01515151560306549, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01515151560306549, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.297525405883789, + "kl": 0.07614044658839703, + "learning_rate": 2.7663333333333335e-06, + "loss": -0.1242, + "num_tokens": 511401.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 31.537037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19524890184402466, + "kl": 0.022649593651294708, + "learning_rate": 2.7660000000000003e-06, + "loss": 0.001, + "num_tokens": 511693.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 31.555555555555557, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.090134143829346, + "kl": 0.04897109046578407, + "learning_rate": 2.7656666666666666e-06, + "loss": 0.0403, + "num_tokens": 511988.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 31.574074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19848203659057617, + "kl": 0.049016520380973816, + "learning_rate": 2.7653333333333334e-06, + "loss": 0.0025, + "num_tokens": 512292.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.012820512987673283, + "clip_ratio/low_min": 0.012820512987673283, + "clip_ratio/region_mean": 0.012820512987673283, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 31.59259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2279224395751953, + "kl": 0.009315238101407886, + "learning_rate": 2.7649999999999998e-06, + "loss": -0.1678, + "num_tokens": 512605.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 31.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010073486715555191, + "kl": 0.004210499115288258, + "learning_rate": 2.764666666666667e-06, + "loss": 0.0002, + "num_tokens": 512873.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 31.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03109670616686344, + "kl": 0.002433445304632187, + "learning_rate": 2.7643333333333334e-06, + "loss": 0.0001, + "num_tokens": 513185.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08693189173936844, + "kl": 0.00425832875771448, + "learning_rate": 2.764e-06, + "loss": 0.0002, + "num_tokens": 513441.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 31.666666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.633188486099243, + "kl": 0.3348800539970398, + "learning_rate": 2.763666666666667e-06, + "loss": 0.0498, + "num_tokens": 513800.0, + "reward": 4.75, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 4.75, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 31.685185185185187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06907296925783157, + "kl": 0.016301962081342936, + "learning_rate": 2.7633333333333333e-06, + "loss": 0.0008, + "num_tokens": 514127.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 31.703703703703702, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.860721588134766, + "kl": 0.021684397011995316, + "learning_rate": 2.763e-06, + "loss": 0.1494, + "num_tokens": 514455.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 31.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5636835098266602, + "kl": 0.050710033625364304, + "learning_rate": 2.762666666666667e-06, + "loss": 0.0025, + "num_tokens": 514750.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.006666666828095913, + "clip_ratio/high_mean": 0.006666666828095913, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006666666828095913, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 31.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.100022315979004, + "kl": 0.0645565465092659, + "learning_rate": 2.7623333333333336e-06, + "loss": 0.0094, + "num_tokens": 515108.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 31.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06662952154874802, + "kl": 0.1508898138999939, + "learning_rate": 2.762e-06, + "loss": 0.0075, + "num_tokens": 515417.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 31.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10787297040224075, + "kl": 0.022396287880837917, + "learning_rate": 2.7616666666666668e-06, + "loss": 0.0011, + "num_tokens": 515704.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 31.796296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1203509047627449, + "kl": 0.014909719116985798, + "learning_rate": 2.761333333333333e-06, + "loss": 0.0008, + "num_tokens": 516013.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.814814814814813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06831133365631104, + "kl": 0.004663305822759867, + "learning_rate": 2.761e-06, + "loss": 0.0002, + "num_tokens": 516277.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 31.833333333333332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0066061001271009445, + "kl": 0.0006123781204223633, + "learning_rate": 2.7606666666666667e-06, + "loss": 0.0, + "num_tokens": 516521.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 31.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.299671173095703, + "kl": 0.01803624164313078, + "learning_rate": 2.7603333333333335e-06, + "loss": 0.0027, + "num_tokens": 516795.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 31.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014989754185080528, + "kl": 0.004661129554733634, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.0002, + "num_tokens": 517069.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 31.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021326709538698196, + "kl": 0.0007004812359809875, + "learning_rate": 2.7596666666666666e-06, + "loss": 0.0, + "num_tokens": 517281.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 31.90740740740741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022083792835474014, + "kl": 0.0032550841569900513, + "learning_rate": 2.7593333333333334e-06, + "loss": 0.0002, + "num_tokens": 517541.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 31.925925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12020346522331238, + "kl": 0.04598797671496868, + "learning_rate": 2.7589999999999998e-06, + "loss": 0.0024, + "num_tokens": 517837.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 31.944444444444443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26906564831733704, + "kl": 0.028645590879023075, + "learning_rate": 2.758666666666667e-06, + "loss": 0.0014, + "num_tokens": 518069.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 31.962962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.644770622253418, + "kl": 0.036845942959189415, + "learning_rate": 2.7583333333333333e-06, + "loss": 0.0417, + "num_tokens": 518401.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 31.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001290903310291469, + "kl": 0.0011155882384628057, + "learning_rate": 2.758e-06, + "loss": 0.0001, + "num_tokens": 518681.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02473565563559532, + "kl": 0.014539883937686682, + "learning_rate": 2.757666666666667e-06, + "loss": 0.0007, + "num_tokens": 518965.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.836700439453125, + "kl": 0.031715997494757175, + "learning_rate": 2.7573333333333332e-06, + "loss": -0.0034, + "num_tokens": 519261.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 32.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08739776164293289, + "kl": 0.001209259033203125, + "learning_rate": 2.757e-06, + "loss": 0.0001, + "num_tokens": 519473.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.223323345184326, + "kl": 0.026694633066654205, + "learning_rate": 2.756666666666667e-06, + "loss": -0.0343, + "num_tokens": 519762.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 32.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1310564279556274, + "kl": 0.10389725491404533, + "learning_rate": 2.7563333333333336e-06, + "loss": 0.0064, + "num_tokens": 520011.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.973709583282471, + "kl": 0.12362165376543999, + "learning_rate": 2.756e-06, + "loss": -0.0595, + "num_tokens": 520316.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 32.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.902437686920166, + "kl": 0.053349819034338, + "learning_rate": 2.7556666666666667e-06, + "loss": 0.3656, + "num_tokens": 520552.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14474616944789886, + "kl": 0.008931699441745877, + "learning_rate": 2.755333333333333e-06, + "loss": 0.0005, + "num_tokens": 520819.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 32.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015790093690156937, + "kl": 0.003581728204153478, + "learning_rate": 2.755e-06, + "loss": 0.0002, + "num_tokens": 521099.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 32.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.182296276092529, + "kl": 0.10936189070343971, + "learning_rate": 2.754666666666667e-06, + "loss": -0.035, + "num_tokens": 521424.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 32.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.206444263458252, + "kl": 0.10994753241539001, + "learning_rate": 2.7543333333333334e-06, + "loss": 0.0055, + "num_tokens": 521644.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 32.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02155369333922863, + "kl": 0.0005641579627990723, + "learning_rate": 2.7540000000000002e-06, + "loss": 0.0, + "num_tokens": 521880.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 32.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10336758941411972, + "kl": 0.038915976881980896, + "learning_rate": 2.7536666666666666e-06, + "loss": 0.0019, + "num_tokens": 522208.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 32.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09731552749872208, + "kl": 0.01642657583579421, + "learning_rate": 2.7533333333333334e-06, + "loss": 0.0009, + "num_tokens": 522480.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 32.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.575925827026367, + "kl": 0.34525352716445923, + "learning_rate": 2.753e-06, + "loss": 0.0599, + "num_tokens": 522780.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023094866424798965, + "kl": 0.014685933012515306, + "learning_rate": 2.752666666666667e-06, + "loss": 0.0007, + "num_tokens": 523064.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 32.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0287400484085083, + "kl": 0.15829047560691833, + "learning_rate": 2.7523333333333333e-06, + "loss": -0.0132, + "num_tokens": 523430.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.175476551055908, + "kl": 0.028773130849003792, + "learning_rate": 2.752e-06, + "loss": -0.0359, + "num_tokens": 523741.0, + "reward": 1.625, + "reward_std": 1.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.25, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 32.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021959371864795685, + "kl": 0.0035362214548513293, + "learning_rate": 2.751666666666667e-06, + "loss": 0.0002, + "num_tokens": 524001.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 32.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5844467282295227, + "kl": 0.08374781534075737, + "learning_rate": 2.7513333333333332e-06, + "loss": 0.0047, + "num_tokens": 524372.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 32.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.082003116607666, + "kl": 0.061678726226091385, + "learning_rate": 2.751e-06, + "loss": 0.3032, + "num_tokens": 524789.0, + "reward": 0.6749999523162842, + "reward_std": 3.4480671882629395, + "rewards/reward_combined/mean": 0.6749999523162842, + "rewards/reward_combined/std": 3.4480671882629395, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 32.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0972365215420723, + "kl": 0.014674036763608456, + "learning_rate": 2.7506666666666668e-06, + "loss": 0.0008, + "num_tokens": 525099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 32.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2397584915161133, + "kl": 0.042815130203962326, + "learning_rate": 2.7503333333333336e-06, + "loss": 0.0409, + "num_tokens": 525508.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2768588960170746, + "kl": 0.10103439539670944, + "learning_rate": 2.75e-06, + "loss": 0.0051, + "num_tokens": 525837.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 32.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10606963187456131, + "kl": 0.02139665000140667, + "learning_rate": 2.7496666666666667e-06, + "loss": 0.0011, + "num_tokens": 526174.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 32.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03547282889485359, + "kl": 0.005235506920143962, + "learning_rate": 2.749333333333333e-06, + "loss": 0.0003, + "num_tokens": 526464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 32.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.4175496101379395, + "kl": 0.1552463248372078, + "learning_rate": 2.7490000000000003e-06, + "loss": 0.0445, + "num_tokens": 526778.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 32.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04348023980855942, + "kl": 0.003014126908965409, + "learning_rate": 2.748666666666667e-06, + "loss": 0.0002, + "num_tokens": 527081.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 32.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5907437205314636, + "kl": 0.032865116372704506, + "learning_rate": 2.7483333333333334e-06, + "loss": 0.0022, + "num_tokens": 527299.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 32.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.109654903411865, + "kl": 0.017590856179594994, + "learning_rate": 2.748e-06, + "loss": 0.1314, + "num_tokens": 527630.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 32.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3732783794403076, + "kl": 0.06276243925094604, + "learning_rate": 2.7476666666666666e-06, + "loss": 0.1708, + "num_tokens": 527987.0, + "reward": 4.5, + "reward_std": 2.345207929611206, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 2.345207929611206, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 32.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0586363710463047, + "kl": 0.002315439283847809, + "learning_rate": 2.7473333333333333e-06, + "loss": 0.0001, + "num_tokens": 528195.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 32.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10004890710115433, + "kl": 0.01631145551800728, + "learning_rate": 2.747e-06, + "loss": 0.0008, + "num_tokens": 528455.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 32.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23430277407169342, + "kl": 0.02228065828603576, + "learning_rate": 2.746666666666667e-06, + "loss": 0.0011, + "num_tokens": 528726.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 32.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8634611368179321, + "kl": 0.051091745495796204, + "learning_rate": 2.7463333333333333e-06, + "loss": 0.0037, + "num_tokens": 528969.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07443425059318542, + "kl": 0.011200740467756987, + "learning_rate": 2.746e-06, + "loss": 0.0006, + "num_tokens": 529294.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 32.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08659444749355316, + "kl": 0.04875837825238705, + "learning_rate": 2.745666666666667e-06, + "loss": 0.0024, + "num_tokens": 529628.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 32.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015384593280032277, + "kl": 0.000998933392111212, + "learning_rate": 2.745333333333333e-06, + "loss": 0.0, + "num_tokens": 529908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 32.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1019458994269371, + "kl": 0.007903525372967124, + "learning_rate": 2.745e-06, + "loss": 0.0004, + "num_tokens": 530236.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 32.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028836917132139206, + "kl": 0.002044334774836898, + "learning_rate": 2.7446666666666668e-06, + "loss": 0.0001, + "num_tokens": 530498.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 85.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 85.75, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.466571092605591, + "kl": 0.03677182085812092, + "learning_rate": 2.7443333333333335e-06, + "loss": 0.4235, + "num_tokens": 531057.0, + "reward": 2.049999952316284, + "reward_std": 4.0509257316589355, + "rewards/reward_combined/mean": 2.049999952316284, + "rewards/reward_combined/std": 4.0509257316589355, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.25, + "completions/mean_terminated_length": 12.333333969116211, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 32.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5762598514556885, + "kl": 0.013551326934248209, + "learning_rate": 2.744e-06, + "loss": 0.4315, + "num_tokens": 531570.0, + "reward": 4.550000190734863, + "reward_std": 3.5930488109588623, + "rewards/reward_combined/mean": 4.550000190734863, + "rewards/reward_combined/std": 3.5930488109588623, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 32.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07321508228778839, + "kl": 0.005755336955189705, + "learning_rate": 2.7436666666666667e-06, + "loss": 0.0003, + "num_tokens": 531893.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0435134693980217, + "kl": 0.00424116151407361, + "learning_rate": 2.743333333333333e-06, + "loss": 0.0002, + "num_tokens": 532177.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.904435157775879, + "kl": 0.04722149111330509, + "learning_rate": 2.7430000000000002e-06, + "loss": 0.0164, + "num_tokens": 532469.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 32.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006464575883001089, + "kl": 0.01561238057911396, + "learning_rate": 2.742666666666667e-06, + "loss": 0.0008, + "num_tokens": 532729.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 32.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.428771018981934, + "kl": 0.06700919196009636, + "learning_rate": 2.7423333333333334e-06, + "loss": 0.0515, + "num_tokens": 533039.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 32.870370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.432555198669434, + "kl": 0.08067074045538902, + "learning_rate": 2.742e-06, + "loss": 0.0303, + "num_tokens": 533335.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 32.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05425287410616875, + "kl": 0.0026105031138285995, + "learning_rate": 2.7416666666666665e-06, + "loss": 0.0001, + "num_tokens": 533646.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 32.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009603277430869639, + "kl": 0.0004247836768627167, + "learning_rate": 2.7413333333333333e-06, + "loss": 0.0, + "num_tokens": 533906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 32.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011686128564178944, + "kl": 0.0008441567624686286, + "learning_rate": 2.741e-06, + "loss": 0.0, + "num_tokens": 534125.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 32.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06443269550800323, + "kl": 0.0023990795016288757, + "learning_rate": 2.740666666666667e-06, + "loss": 0.0001, + "num_tokens": 534381.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 32.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03564298525452614, + "kl": 0.006909394636750221, + "learning_rate": 2.7403333333333332e-06, + "loss": 0.0004, + "num_tokens": 534651.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 32.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17307555675506592, + "kl": 0.05741347745060921, + "learning_rate": 2.74e-06, + "loss": 0.0029, + "num_tokens": 534958.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 33.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09630684554576874, + "kl": 0.011841883417218924, + "learning_rate": 2.739666666666667e-06, + "loss": 0.0006, + "num_tokens": 535231.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 33.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0882561206817627, + "kl": 0.04686512239277363, + "learning_rate": 2.739333333333333e-06, + "loss": 0.4561, + "num_tokens": 535742.0, + "reward": 5.925000190734863, + "reward_std": 4.150000095367432, + "rewards/reward_combined/mean": 5.925000190734863, + "rewards/reward_combined/std": 4.150000095367432, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 33.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07120820134878159, + "kl": 0.16184628754854202, + "learning_rate": 2.7390000000000004e-06, + "loss": 0.0081, + "num_tokens": 536053.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 33.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.441292762756348, + "kl": 0.08373421430587769, + "learning_rate": 2.7386666666666667e-06, + "loss": 0.286, + "num_tokens": 536287.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 33.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2790210247039795, + "kl": 0.15694626420736313, + "learning_rate": 2.7383333333333335e-06, + "loss": 0.0245, + "num_tokens": 536603.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 33.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47888123989105225, + "kl": 0.048293492989614606, + "learning_rate": 2.738e-06, + "loss": 0.0024, + "num_tokens": 536863.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036101879086345434, + "kl": 0.0004414692521095276, + "learning_rate": 2.7376666666666667e-06, + "loss": 0.0, + "num_tokens": 537123.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 33.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06551062315702438, + "kl": 0.013508519157767296, + "learning_rate": 2.7373333333333334e-06, + "loss": 0.0007, + "num_tokens": 537429.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 33.148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.506830215454102, + "kl": 0.00290696881711483, + "learning_rate": 2.7370000000000002e-06, + "loss": 0.3031, + "num_tokens": 537678.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 33.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15645378828048706, + "kl": 0.013062998652458191, + "learning_rate": 2.736666666666667e-06, + "loss": 0.0007, + "num_tokens": 537894.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 33.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.951319932937622, + "kl": 0.04044189304113388, + "learning_rate": 2.7363333333333334e-06, + "loss": -0.0002, + "num_tokens": 538298.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 33.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059244900941848755, + "kl": 0.0010513767483644187, + "learning_rate": 2.736e-06, + "loss": 0.0001, + "num_tokens": 538511.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 33.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014438227750360966, + "kl": 0.0013181971735320985, + "learning_rate": 2.7356666666666665e-06, + "loss": 0.0001, + "num_tokens": 538820.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13680005073547363, + "kl": 0.02158356038853526, + "learning_rate": 2.7353333333333333e-06, + "loss": 0.0012, + "num_tokens": 539091.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 33.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02796652540564537, + "kl": 0.09261302649974823, + "learning_rate": 2.735e-06, + "loss": 0.0046, + "num_tokens": 539457.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 33.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.050706386566162, + "kl": 0.05082953721284866, + "learning_rate": 2.734666666666667e-06, + "loss": 0.0044, + "num_tokens": 539787.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 33.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.560703754425049, + "kl": 0.1710280478000641, + "learning_rate": 2.7343333333333332e-06, + "loss": -0.014, + "num_tokens": 540135.0, + "reward": 5.375, + "reward_std": 2.75, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.75, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 33.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4310381412506104, + "kl": 0.04510759375989437, + "learning_rate": 2.734e-06, + "loss": 0.0159, + "num_tokens": 540464.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 33.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4282565116882324, + "kl": 0.07379312999546528, + "learning_rate": 2.7336666666666668e-06, + "loss": 0.0038, + "num_tokens": 540759.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 33.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15546134114265442, + "kl": 0.047861428931355476, + "learning_rate": 2.733333333333333e-06, + "loss": 0.0028, + "num_tokens": 541077.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 33.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.939266204833984, + "kl": 0.03174888156354427, + "learning_rate": 2.7330000000000003e-06, + "loss": 0.0013, + "num_tokens": 541337.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 33.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4484003782272339, + "kl": 0.023341971449553967, + "learning_rate": 2.7326666666666667e-06, + "loss": 0.0012, + "num_tokens": 541593.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 33.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.292440891265869, + "kl": 0.2836736887693405, + "learning_rate": 2.7323333333333335e-06, + "loss": 0.0335, + "num_tokens": 541878.0, + "reward": 6.125, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 33.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6213285326957703, + "kl": 0.08234158530831337, + "learning_rate": 2.732e-06, + "loss": 0.0045, + "num_tokens": 542168.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 33.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045906562358140945, + "kl": 0.0012828707695007324, + "learning_rate": 2.7316666666666666e-06, + "loss": 0.0001, + "num_tokens": 542374.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 33.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24615296721458435, + "kl": 0.010953473305562511, + "learning_rate": 2.7313333333333334e-06, + "loss": 0.0006, + "num_tokens": 542596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036423616111278534, + "kl": 0.002574182115495205, + "learning_rate": 2.731e-06, + "loss": 0.0001, + "num_tokens": 542878.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 33.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5580596923828125, + "kl": 0.04375036060810089, + "learning_rate": 2.730666666666667e-06, + "loss": 0.0604, + "num_tokens": 543207.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.005747126415371895, + "clip_ratio/low_min": 0.005747126415371895, + "clip_ratio/region_mean": 0.005747126415371895, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 33.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0974574089050293, + "kl": 0.06714414805173874, + "learning_rate": 2.7303333333333333e-06, + "loss": 0.1271, + "num_tokens": 543596.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 33.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010035802610218525, + "kl": 0.00916808657348156, + "learning_rate": 2.73e-06, + "loss": 0.0005, + "num_tokens": 543868.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028103139251470566, + "kl": 0.002121154509950429, + "learning_rate": 2.7296666666666665e-06, + "loss": 0.0001, + "num_tokens": 544145.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 33.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028382311575114727, + "kl": 0.00012265145778656006, + "learning_rate": 2.7293333333333333e-06, + "loss": 0.0, + "num_tokens": 544365.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 33.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12235185503959656, + "kl": 0.016257425770163536, + "learning_rate": 2.729e-06, + "loss": 0.0008, + "num_tokens": 544658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.487502098083496, + "kl": 0.06302844732999802, + "learning_rate": 2.728666666666667e-06, + "loss": 0.1364, + "num_tokens": 544920.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 33.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.118229866027832, + "kl": 0.045467047952115536, + "learning_rate": 2.7283333333333336e-06, + "loss": 0.0879, + "num_tokens": 545257.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08995601534843445, + "kl": 0.012283294927328825, + "learning_rate": 2.728e-06, + "loss": 0.0006, + "num_tokens": 545531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 33.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5635440945625305, + "kl": 0.06689948117127642, + "learning_rate": 2.7276666666666668e-06, + "loss": 0.003, + "num_tokens": 545834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04382326081395149, + "kl": 0.003987176809459925, + "learning_rate": 2.7273333333333335e-06, + "loss": 0.0002, + "num_tokens": 546130.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 33.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8575448989868164, + "kl": 0.017958277836441994, + "learning_rate": 2.7270000000000003e-06, + "loss": 0.3982, + "num_tokens": 546689.0, + "reward": 3.799999952316284, + "reward_std": 5.588082790374756, + "rewards/reward_combined/mean": 3.799999952316284, + "rewards/reward_combined/std": 5.588082313537598, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 33.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1541658639907837, + "kl": 0.020028742030262947, + "learning_rate": 2.7266666666666667e-06, + "loss": 0.001, + "num_tokens": 547001.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15733495354652405, + "kl": 0.011172421742230654, + "learning_rate": 2.7263333333333335e-06, + "loss": 0.0006, + "num_tokens": 547264.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.272762298583984, + "kl": 0.038841452449560165, + "learning_rate": 2.726e-06, + "loss": 0.3965, + "num_tokens": 547563.0, + "reward": 3.799999952316284, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 3.799999952316284, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 33.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787581741809845, + "kl": 0.06607088446617126, + "learning_rate": 2.7256666666666666e-06, + "loss": 0.0032, + "num_tokens": 547904.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 33.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037928588688373566, + "kl": 0.006824996671639383, + "learning_rate": 2.7253333333333334e-06, + "loss": 0.0004, + "num_tokens": 548174.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 33.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5614644885063171, + "kl": 0.058784357039257884, + "learning_rate": 2.725e-06, + "loss": 0.0031, + "num_tokens": 548464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 33.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05880916118621826, + "kl": 0.0033947378396987915, + "learning_rate": 2.724666666666667e-06, + "loss": 0.0002, + "num_tokens": 548736.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 33.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.120548963546753, + "kl": 0.13346320390701294, + "learning_rate": 2.7243333333333333e-06, + "loss": -0.0513, + "num_tokens": 549037.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 33.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08798273652791977, + "kl": 0.01510192733258009, + "learning_rate": 2.724e-06, + "loss": 0.0007, + "num_tokens": 549368.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 33.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018094424158334732, + "kl": 0.0017211210215464234, + "learning_rate": 2.7236666666666665e-06, + "loss": 0.0001, + "num_tokens": 549603.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 33.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009208236820995808, + "kl": 0.00010989755264745327, + "learning_rate": 2.7233333333333337e-06, + "loss": 0.0, + "num_tokens": 549873.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 33.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026512209326028824, + "kl": 0.0014691509422846138, + "learning_rate": 2.723e-06, + "loss": 0.0001, + "num_tokens": 550135.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 33.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26817670464515686, + "kl": 0.04431943129748106, + "learning_rate": 2.722666666666667e-06, + "loss": 0.0022, + "num_tokens": 550435.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 33.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06412789970636368, + "kl": 0.004469543811865151, + "learning_rate": 2.7223333333333336e-06, + "loss": 0.0002, + "num_tokens": 550756.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007462686393409967, + "clip_ratio/low_min": 0.007462686393409967, + "clip_ratio/region_mean": 0.007462686393409967, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 33.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.906788349151611, + "kl": 0.07722705788910389, + "learning_rate": 2.722e-06, + "loss": 0.0279, + "num_tokens": 551104.0, + "reward": 3.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 3.625, + "rewards/reward_combined/std": 2.75, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 34.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.915585517883301, + "kl": 0.06409088708460331, + "learning_rate": 2.7216666666666667e-06, + "loss": 0.1676, + "num_tokens": 551452.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 34.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027719201520085335, + "kl": 0.0017488243756815791, + "learning_rate": 2.7213333333333335e-06, + "loss": 0.0001, + "num_tokens": 551730.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 34.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05981970950961113, + "kl": 0.0038055373588576913, + "learning_rate": 2.7210000000000003e-06, + "loss": 0.0002, + "num_tokens": 552036.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 34.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.08851432800293, + "kl": 0.13051492274098564, + "learning_rate": 2.7206666666666667e-06, + "loss": -0.0264, + "num_tokens": 552255.0, + "reward": 2.375, + "reward_std": 1.8874585628509521, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.8874585628509521, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 34.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.732242584228516, + "kl": 0.01689472608268261, + "learning_rate": 2.7203333333333334e-06, + "loss": 0.1354, + "num_tokens": 552505.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 34.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6987879276275635, + "kl": 0.031687792390584946, + "learning_rate": 2.72e-06, + "loss": 0.1352, + "num_tokens": 552854.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 34.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02235507033765316, + "kl": 0.0011891061440110207, + "learning_rate": 2.7196666666666666e-06, + "loss": 0.0001, + "num_tokens": 553182.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09800175577402115, + "kl": 0.007735051680356264, + "learning_rate": 2.7193333333333334e-06, + "loss": 0.0004, + "num_tokens": 553473.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05617184564471245, + "kl": 0.008804846089333296, + "learning_rate": 2.719e-06, + "loss": 0.0004, + "num_tokens": 553757.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007352941203862429, + "clip_ratio/low_min": 0.007352941203862429, + "clip_ratio/region_mean": 0.007352941203862429, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 34.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.230052947998047, + "kl": 0.012352522229775786, + "learning_rate": 2.718666666666667e-06, + "loss": -0.0113, + "num_tokens": 554111.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1942746937274933, + "kl": 0.03265456482768059, + "learning_rate": 2.7183333333333333e-06, + "loss": 0.0016, + "num_tokens": 554397.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 34.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028727499768137932, + "kl": 0.002476705703884363, + "learning_rate": 2.718e-06, + "loss": 0.0001, + "num_tokens": 554631.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 34.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02679610252380371, + "kl": 0.0014699590974487364, + "learning_rate": 2.7176666666666664e-06, + "loss": 0.0001, + "num_tokens": 554893.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 34.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2361757904291153, + "kl": 0.00960657000541687, + "learning_rate": 2.7173333333333336e-06, + "loss": 0.0006, + "num_tokens": 555109.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.014705882407724857, + "clip_ratio/high_mean": 0.014705882407724857, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014705882407724857, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 34.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.724506378173828, + "kl": 0.8115830812603235, + "learning_rate": 2.717e-06, + "loss": 0.0364, + "num_tokens": 555486.0, + "reward": 2.0, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 34.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024522658437490463, + "kl": 0.0023338720202445984, + "learning_rate": 2.7166666666666668e-06, + "loss": 0.0001, + "num_tokens": 555696.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 34.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1523863673210144, + "kl": 0.0074703507125377655, + "learning_rate": 2.7163333333333336e-06, + "loss": 0.0004, + "num_tokens": 555956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 34.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11143297702074051, + "kl": 0.01677517336793244, + "learning_rate": 2.716e-06, + "loss": 0.0009, + "num_tokens": 556228.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.107197284698486, + "kl": 0.28561924397945404, + "learning_rate": 2.7156666666666667e-06, + "loss": -0.019, + "num_tokens": 556530.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 34.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0416756235063076, + "kl": 0.007633190951310098, + "learning_rate": 2.7153333333333335e-06, + "loss": 0.0004, + "num_tokens": 556800.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 34.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1109652891755104, + "kl": 0.022345018573105335, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.0011, + "num_tokens": 557132.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 34.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20400479435920715, + "kl": 0.06068544089794159, + "learning_rate": 2.7146666666666666e-06, + "loss": 0.003, + "num_tokens": 557432.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 34.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060002490878105164, + "kl": 0.007621090626344085, + "learning_rate": 2.7143333333333334e-06, + "loss": 0.0004, + "num_tokens": 557704.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 34.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.0465216636657715, + "kl": 0.04501319723203778, + "learning_rate": 2.7139999999999998e-06, + "loss": 0.2299, + "num_tokens": 557996.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 34.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043118901550769806, + "kl": 0.0008434891351498663, + "learning_rate": 2.7136666666666665e-06, + "loss": 0.0, + "num_tokens": 558252.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 34.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002917917910963297, + "kl": 3.408889097045176e-05, + "learning_rate": 2.7133333333333338e-06, + "loss": 0.0, + "num_tokens": 558524.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 34.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8276538848876953, + "kl": 0.10735499858856201, + "learning_rate": 2.713e-06, + "loss": 0.1364, + "num_tokens": 558877.0, + "reward": 5.375, + "reward_std": 4.25, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 4.25, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 34.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027922697365283966, + "kl": 0.0015048664063215256, + "learning_rate": 2.712666666666667e-06, + "loss": 0.0001, + "num_tokens": 559189.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 34.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1018409729003906, + "kl": 0.08543189987540245, + "learning_rate": 2.7123333333333333e-06, + "loss": 0.0513, + "num_tokens": 559504.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1569334864616394, + "kl": 0.015963513404130936, + "learning_rate": 2.712e-06, + "loss": 0.0008, + "num_tokens": 559794.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4369741380214691, + "kl": 0.08534810319542885, + "learning_rate": 2.711666666666667e-06, + "loss": 0.0043, + "num_tokens": 560063.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 34.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0688805803656578, + "kl": 0.04286644235253334, + "learning_rate": 2.7113333333333336e-06, + "loss": 0.0021, + "num_tokens": 560391.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010634365491569042, + "kl": 0.009167622774839401, + "learning_rate": 2.711e-06, + "loss": 0.0005, + "num_tokens": 560663.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 34.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007116298656910658, + "kl": 0.015543228946626186, + "learning_rate": 2.7106666666666667e-06, + "loss": 0.0008, + "num_tokens": 560923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 34.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08665429055690765, + "kl": 0.04061730671674013, + "learning_rate": 2.7103333333333335e-06, + "loss": 0.0021, + "num_tokens": 561221.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 34.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05451120436191559, + "kl": 0.1576053649187088, + "learning_rate": 2.71e-06, + "loss": 0.0079, + "num_tokens": 561532.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 34.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07285292446613312, + "kl": 0.006827495992183685, + "learning_rate": 2.7096666666666667e-06, + "loss": 0.0003, + "num_tokens": 561748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 34.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004154373425990343, + "kl": 0.00019634515047073364, + "learning_rate": 2.7093333333333335e-06, + "loss": 0.0, + "num_tokens": 561968.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 34.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.073772668838501, + "kl": 0.03586939349770546, + "learning_rate": 2.7090000000000002e-06, + "loss": 0.0533, + "num_tokens": 562386.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 34.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00695054093375802, + "kl": 0.0025257617235183716, + "learning_rate": 2.7086666666666666e-06, + "loss": 0.0001, + "num_tokens": 562622.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 34.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03961246460676193, + "kl": 0.003956240834668279, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.0002, + "num_tokens": 562918.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017291797557845712, + "kl": 0.001061158487573266, + "learning_rate": 2.7079999999999997e-06, + "loss": 0.0001, + "num_tokens": 563198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 34.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2501739263534546, + "kl": 0.037852637469768524, + "learning_rate": 2.707666666666667e-06, + "loss": 0.0019, + "num_tokens": 563519.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 34.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1451413482427597, + "kl": 0.01997746340930462, + "learning_rate": 2.7073333333333337e-06, + "loss": 0.001, + "num_tokens": 563783.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 34.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04497898742556572, + "kl": 0.0004963263927493244, + "learning_rate": 2.707e-06, + "loss": 0.0, + "num_tokens": 563996.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04191657528281212, + "kl": 0.004467744147405028, + "learning_rate": 2.706666666666667e-06, + "loss": 0.0002, + "num_tokens": 564286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31732162833213806, + "kl": 0.026654242421500385, + "learning_rate": 2.7063333333333332e-06, + "loss": 0.0014, + "num_tokens": 564586.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 34.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8125565648078918, + "kl": 0.08514676988124847, + "learning_rate": 2.706e-06, + "loss": 0.0042, + "num_tokens": 564920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11653347313404083, + "kl": 0.006543122231960297, + "learning_rate": 2.705666666666667e-06, + "loss": 0.0003, + "num_tokens": 565186.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 34.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048593275249004364, + "kl": 0.0035750133683905005, + "learning_rate": 2.7053333333333336e-06, + "loss": 0.0002, + "num_tokens": 565470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 34.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16356879472732544, + "kl": 0.030564725399017334, + "learning_rate": 2.705e-06, + "loss": 0.0015, + "num_tokens": 565766.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 88.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 88.5, + "completions/mean_terminated_length": 32.66666793823242, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 34.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4624722003936768, + "kl": 0.0702102892100811, + "learning_rate": 2.7046666666666667e-06, + "loss": 0.3952, + "num_tokens": 566336.0, + "reward": 1.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 3.5, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 34.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02429959364235401, + "kl": 0.09237717092037201, + "learning_rate": 2.7043333333333335e-06, + "loss": 0.0046, + "num_tokens": 566702.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04286166653037071, + "kl": 0.003630565945059061, + "learning_rate": 2.704e-06, + "loss": 0.0002, + "num_tokens": 567025.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 35.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11360520124435425, + "kl": 0.05920999124646187, + "learning_rate": 2.7036666666666666e-06, + "loss": 0.0031, + "num_tokens": 567334.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 35.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05320507660508156, + "kl": 0.002864798763766885, + "learning_rate": 2.7033333333333334e-06, + "loss": 0.0001, + "num_tokens": 567656.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020746750757098198, + "kl": 0.0024244025407824665, + "learning_rate": 2.703e-06, + "loss": 0.0001, + "num_tokens": 567952.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 35.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014514111913740635, + "kl": 0.0015230292920023203, + "learning_rate": 2.7026666666666666e-06, + "loss": 0.0001, + "num_tokens": 568236.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 35.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017323600128293037, + "kl": 0.0011899081291630864, + "learning_rate": 2.7023333333333334e-06, + "loss": 0.0001, + "num_tokens": 568498.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 35.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1588435173034668, + "kl": 0.010516722686588764, + "learning_rate": 2.7019999999999997e-06, + "loss": 0.0005, + "num_tokens": 568800.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 35.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33706891536712646, + "kl": 0.029557883739471436, + "learning_rate": 2.701666666666667e-06, + "loss": 0.0014, + "num_tokens": 569035.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 35.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962471902370453, + "kl": 0.04810686968266964, + "learning_rate": 2.7013333333333337e-06, + "loss": 0.0024, + "num_tokens": 569303.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 35.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039578065276145935, + "kl": 0.0042112350929528475, + "learning_rate": 2.701e-06, + "loss": 0.0002, + "num_tokens": 569592.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02448314242064953, + "kl": 0.0009404495358467102, + "learning_rate": 2.700666666666667e-06, + "loss": 0.0, + "num_tokens": 569852.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007747381925582886, + "kl": 0.003932915162295103, + "learning_rate": 2.700333333333333e-06, + "loss": 0.0002, + "num_tokens": 570120.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 35.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33407706022262573, + "kl": 0.02135928813368082, + "learning_rate": 2.7e-06, + "loss": 0.0011, + "num_tokens": 570389.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 35.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001607077312655747, + "kl": 0.001033989479765296, + "learning_rate": 2.6996666666666668e-06, + "loss": 0.0001, + "num_tokens": 570669.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 35.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026151256635785103, + "kl": 0.001922984141856432, + "learning_rate": 2.6993333333333335e-06, + "loss": 0.0001, + "num_tokens": 570949.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 35.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030562663450837135, + "kl": 0.0001345425844192505, + "learning_rate": 2.699e-06, + "loss": 0.0, + "num_tokens": 571169.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 35.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04828093945980072, + "kl": 0.0015800580149516463, + "learning_rate": 2.6986666666666667e-06, + "loss": 0.0001, + "num_tokens": 571439.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 35.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007243466097861528, + "kl": 0.002480931580066681, + "learning_rate": 2.6983333333333335e-06, + "loss": 0.0001, + "num_tokens": 571675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 35.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791655629873276, + "kl": 0.011090089567005634, + "learning_rate": 2.698e-06, + "loss": 0.0006, + "num_tokens": 572000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 35.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04971334710717201, + "kl": 0.008720860816538334, + "learning_rate": 2.697666666666667e-06, + "loss": 0.0004, + "num_tokens": 572348.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026712121441960335, + "kl": 0.001853201538324356, + "learning_rate": 2.6973333333333334e-06, + "loss": 0.0001, + "num_tokens": 572592.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 35.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.464919328689575, + "kl": 0.044607602059841156, + "learning_rate": 2.697e-06, + "loss": -0.0065, + "num_tokens": 573010.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 35.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0087440125644207, + "kl": 0.0016347132623195648, + "learning_rate": 2.6966666666666665e-06, + "loss": 0.0001, + "num_tokens": 573322.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 35.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6041319966316223, + "kl": 0.046095360070466995, + "learning_rate": 2.6963333333333333e-06, + "loss": 0.0025, + "num_tokens": 573543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 35.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12174998968839645, + "kl": 0.05907886102795601, + "learning_rate": 2.696e-06, + "loss": 0.003, + "num_tokens": 573875.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 35.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1315929889678955, + "kl": 0.03039420396089554, + "learning_rate": 2.695666666666667e-06, + "loss": 0.0101, + "num_tokens": 574226.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 35.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029044976457953453, + "kl": 0.0014936476945877075, + "learning_rate": 2.6953333333333337e-06, + "loss": 0.0001, + "num_tokens": 574436.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 35.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0063663870096206665, + "kl": 0.0009174628066830337, + "learning_rate": 2.695e-06, + "loss": 0.0, + "num_tokens": 574748.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 35.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5420122146606445, + "kl": 0.10465467721223831, + "learning_rate": 2.694666666666667e-06, + "loss": -0.0478, + "num_tokens": 575083.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 35.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043420203030109406, + "kl": 0.00966458348557353, + "learning_rate": 2.694333333333333e-06, + "loss": 0.0005, + "num_tokens": 575434.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 35.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03351733461022377, + "kl": 0.003094971179962158, + "learning_rate": 2.694e-06, + "loss": 0.0002, + "num_tokens": 575646.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 35.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11436270922422409, + "kl": 0.003960305359214544, + "learning_rate": 2.6936666666666667e-06, + "loss": 0.0002, + "num_tokens": 575902.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 35.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16186094284057617, + "kl": 0.04600801132619381, + "learning_rate": 2.6933333333333335e-06, + "loss": 0.0023, + "num_tokens": 576234.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 35.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6478888988494873, + "kl": 0.02407922176644206, + "learning_rate": 2.693e-06, + "loss": -0.0327, + "num_tokens": 576537.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.309637546539307, + "kl": 0.052644552662968636, + "learning_rate": 2.6926666666666667e-06, + "loss": -0.0156, + "num_tokens": 576871.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 35.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12051546573638916, + "kl": 0.10289335250854492, + "learning_rate": 2.6923333333333334e-06, + "loss": 0.0052, + "num_tokens": 577236.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 35.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068525031208992, + "kl": 0.003370234277099371, + "learning_rate": 2.692e-06, + "loss": 0.0002, + "num_tokens": 577496.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 35.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11714453995227814, + "kl": 0.0013956725597381592, + "learning_rate": 2.691666666666667e-06, + "loss": 0.0001, + "num_tokens": 577708.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 35.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.273424357175827, + "kl": 0.03037982527166605, + "learning_rate": 2.6913333333333334e-06, + "loss": 0.0016, + "num_tokens": 578044.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.009803921915590763, + "clip_ratio/high_mean": 0.009803921915590763, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009803921915590763, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 35.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.6420745849609375, + "kl": 0.3156106173992157, + "learning_rate": 2.691e-06, + "loss": -0.1286, + "num_tokens": 578351.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4298832416534424, + "kl": 0.04664428532123566, + "learning_rate": 2.6906666666666665e-06, + "loss": 0.0013, + "num_tokens": 578695.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 35.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.519260883331299, + "kl": 0.029957876540720463, + "learning_rate": 2.6903333333333333e-06, + "loss": 0.1924, + "num_tokens": 578983.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12322692573070526, + "kl": 0.024027224630117416, + "learning_rate": 2.69e-06, + "loss": 0.0012, + "num_tokens": 579280.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1092136949300766, + "kl": 0.047265585511922836, + "learning_rate": 2.689666666666667e-06, + "loss": 0.002, + "num_tokens": 579607.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02424030937254429, + "kl": 0.005536033306270838, + "learning_rate": 2.6893333333333336e-06, + "loss": 0.0003, + "num_tokens": 579875.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.863023281097412, + "kl": 0.03910275222733617, + "learning_rate": 2.689e-06, + "loss": 0.0251, + "num_tokens": 580149.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06410113722085953, + "kl": 0.1623278483748436, + "learning_rate": 2.6886666666666668e-06, + "loss": 0.0081, + "num_tokens": 580457.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 35.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.260194301605225, + "kl": 0.0674049761146307, + "learning_rate": 2.688333333333333e-06, + "loss": 0.0243, + "num_tokens": 580764.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 35.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06127078831195831, + "kl": 0.007338077761232853, + "learning_rate": 2.688e-06, + "loss": 0.0004, + "num_tokens": 581054.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 35.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.13819694519043, + "kl": 0.21109507232904434, + "learning_rate": 2.6876666666666667e-06, + "loss": 0.1245, + "num_tokens": 581367.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 35.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005136251449584961, + "kl": 0.015850027091801167, + "learning_rate": 2.6873333333333335e-06, + "loss": 0.0008, + "num_tokens": 581627.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 35.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03257406875491142, + "kl": 0.01321916887536645, + "learning_rate": 2.6870000000000003e-06, + "loss": 0.0007, + "num_tokens": 581911.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 35.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14937995374202728, + "kl": 0.04119663592427969, + "learning_rate": 2.6866666666666666e-06, + "loss": 0.0021, + "num_tokens": 582215.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2690214514732361, + "kl": 0.04237298294901848, + "learning_rate": 2.6863333333333334e-06, + "loss": 0.002, + "num_tokens": 582497.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17558544874191284, + "kl": 0.018677019514143467, + "learning_rate": 2.686e-06, + "loss": 0.001, + "num_tokens": 582766.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0075008501298725605, + "kl": 0.0024456456303596497, + "learning_rate": 2.685666666666667e-06, + "loss": 0.0001, + "num_tokens": 582982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 36.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01434825174510479, + "kl": 0.26557381451129913, + "learning_rate": 2.6853333333333333e-06, + "loss": 0.0133, + "num_tokens": 583286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 36.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.18107795715332, + "kl": 0.14554791525006294, + "learning_rate": 2.685e-06, + "loss": 0.0069, + "num_tokens": 583624.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 36.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032823607325553894, + "kl": 0.0010624155402183533, + "learning_rate": 2.6846666666666665e-06, + "loss": 0.0001, + "num_tokens": 583832.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 36.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031030649319291115, + "kl": 0.0019805729389190674, + "learning_rate": 2.6843333333333333e-06, + "loss": 0.0001, + "num_tokens": 584044.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 36.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004425667691975832, + "kl": 0.01601268444210291, + "learning_rate": 2.684e-06, + "loss": 0.0008, + "num_tokens": 584304.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 36.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6444244384765625, + "kl": 0.07742475718259811, + "learning_rate": 2.683666666666667e-06, + "loss": 0.1037, + "num_tokens": 584674.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 36.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7247812151908875, + "kl": 0.08049389312509447, + "learning_rate": 2.6833333333333336e-06, + "loss": 0.004, + "num_tokens": 584958.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 36.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03097403421998024, + "kl": 0.013565556146204472, + "learning_rate": 2.683e-06, + "loss": 0.0007, + "num_tokens": 585242.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 36.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04462660476565361, + "kl": 0.013474771287292242, + "learning_rate": 2.6826666666666668e-06, + "loss": 0.0007, + "num_tokens": 585514.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 36.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006568982731550932, + "kl": 0.0008408394933212548, + "learning_rate": 2.682333333333333e-06, + "loss": 0.0, + "num_tokens": 585776.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05752872675657272, + "kl": 0.0017148196493508294, + "learning_rate": 2.6820000000000003e-06, + "loss": 0.0001, + "num_tokens": 585995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 36.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.113161101937294, + "kl": 0.007638789131306112, + "learning_rate": 2.6816666666666667e-06, + "loss": 0.0004, + "num_tokens": 586324.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 36.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6874308586120605, + "kl": 0.019324714317917824, + "learning_rate": 2.6813333333333335e-06, + "loss": -0.0336, + "num_tokens": 586656.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 36.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18057064712047577, + "kl": 0.029344365932047367, + "learning_rate": 2.6810000000000003e-06, + "loss": 0.0015, + "num_tokens": 586990.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 36.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.487419605255127, + "kl": 0.4813556857407093, + "learning_rate": 2.6806666666666666e-06, + "loss": 0.0373, + "num_tokens": 587259.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 36.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05987457558512688, + "kl": 0.002563178539276123, + "learning_rate": 2.6803333333333334e-06, + "loss": 0.0001, + "num_tokens": 587523.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 36.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10670791566371918, + "kl": 0.003754036850295961, + "learning_rate": 2.68e-06, + "loss": 0.0002, + "num_tokens": 587756.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17386090755462646, + "kl": 0.009071653243154287, + "learning_rate": 2.679666666666667e-06, + "loss": 0.0006, + "num_tokens": 587989.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 36.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7542484402656555, + "kl": 0.0691049792803824, + "learning_rate": 2.6793333333333333e-06, + "loss": 0.0042, + "num_tokens": 588305.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03952301666140556, + "kl": 0.00029768794775009155, + "learning_rate": 2.679e-06, + "loss": 0.0, + "num_tokens": 588517.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 36.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08086838573217392, + "kl": 0.003991584060713649, + "learning_rate": 2.6786666666666665e-06, + "loss": 0.0002, + "num_tokens": 588760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 36.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5372467041015625, + "kl": 0.04116313345730305, + "learning_rate": 2.6783333333333332e-06, + "loss": 0.0096, + "num_tokens": 589165.0, + "reward": 5.0, + "reward_std": 5.0, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 5.0, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014552439097315073, + "kl": 4.976242780685425e-05, + "learning_rate": 2.678e-06, + "loss": 0.0, + "num_tokens": 589385.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 36.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.461563587188721, + "kl": 0.012471605325117707, + "learning_rate": 2.677666666666667e-06, + "loss": 0.0003, + "num_tokens": 589657.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 36.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10504665970802307, + "kl": 0.017772881779819727, + "learning_rate": 2.6773333333333336e-06, + "loss": 0.0009, + "num_tokens": 589980.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 36.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023820899426937103, + "kl": 0.09272093325853348, + "learning_rate": 2.677e-06, + "loss": 0.0046, + "num_tokens": 590346.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 36.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011626126943156123, + "kl": 0.0012381459819152951, + "learning_rate": 2.6766666666666667e-06, + "loss": 0.0001, + "num_tokens": 590626.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 36.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.071332186460495, + "kl": 0.007835924974642694, + "learning_rate": 2.676333333333333e-06, + "loss": 0.0004, + "num_tokens": 590953.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 36.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003008150262758136, + "kl": 0.0004336945712566376, + "learning_rate": 2.6760000000000003e-06, + "loss": 0.0, + "num_tokens": 591213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 36.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05171915888786316, + "kl": 0.009745566640049219, + "learning_rate": 2.6756666666666667e-06, + "loss": 0.0005, + "num_tokens": 591518.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 36.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.0136566162109375, + "kl": 0.1905686825630255, + "learning_rate": 2.6753333333333334e-06, + "loss": 0.1166, + "num_tokens": 591784.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 36.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009792163968086243, + "kl": 0.004565859213471413, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0002, + "num_tokens": 592052.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 36.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08396516740322113, + "kl": 0.005421443609520793, + "learning_rate": 2.6746666666666666e-06, + "loss": 0.0003, + "num_tokens": 592346.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 36.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006908867042511702, + "kl": 0.0019788509234786034, + "learning_rate": 2.6743333333333334e-06, + "loss": 0.0001, + "num_tokens": 592658.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 36.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3667496144771576, + "kl": 0.11709627509117126, + "learning_rate": 2.674e-06, + "loss": 0.0055, + "num_tokens": 593005.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 36.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3655175268650055, + "kl": 0.08529950305819511, + "learning_rate": 2.673666666666667e-06, + "loss": 0.0041, + "num_tokens": 593320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 36.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02950194478034973, + "kl": 0.1593296006321907, + "learning_rate": 2.6733333333333333e-06, + "loss": 0.008, + "num_tokens": 593628.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 36.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028065474703907967, + "kl": 0.0034218335058540106, + "learning_rate": 2.673e-06, + "loss": 0.0002, + "num_tokens": 593924.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 36.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.274560928344727, + "kl": 0.10706992074847221, + "learning_rate": 2.6726666666666664e-06, + "loss": 0.0407, + "num_tokens": 594299.0, + "reward": 3.0, + "reward_std": 3.188521146774292, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.188521146774292, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 36.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009605297818779945, + "kl": 0.0004290342330932617, + "learning_rate": 2.6723333333333332e-06, + "loss": 0.0, + "num_tokens": 594571.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 36.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045539434999227524, + "kl": 0.009111804887652397, + "learning_rate": 2.6720000000000004e-06, + "loss": 0.0005, + "num_tokens": 594863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 36.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08162450790405273, + "kl": 0.00872807833366096, + "learning_rate": 2.6716666666666668e-06, + "loss": 0.0004, + "num_tokens": 595157.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 36.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051039643585681915, + "kl": 0.002916533630923368, + "learning_rate": 2.6713333333333336e-06, + "loss": 0.0001, + "num_tokens": 595435.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 36.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9727557301521301, + "kl": 0.08859988860785961, + "learning_rate": 2.671e-06, + "loss": 0.0043, + "num_tokens": 595730.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 36.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006847368087619543, + "kl": 0.01034638099372387, + "learning_rate": 2.6706666666666667e-06, + "loss": 0.0005, + "num_tokens": 596002.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 36.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008849128149449825, + "kl": 0.0020621493458747864, + "learning_rate": 2.6703333333333335e-06, + "loss": 0.0001, + "num_tokens": 596238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 36.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15213976800441742, + "kl": 0.0546210166066885, + "learning_rate": 2.6700000000000003e-06, + "loss": 0.0027, + "num_tokens": 596542.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 36.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07517503201961517, + "kl": 0.010591336991637945, + "learning_rate": 2.6696666666666666e-06, + "loss": 0.0005, + "num_tokens": 596876.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 36.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36763033270835876, + "kl": 0.04920933814719319, + "learning_rate": 2.6693333333333334e-06, + "loss": 0.0021, + "num_tokens": 597157.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 36.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14691144227981567, + "kl": 0.018337004352360964, + "learning_rate": 2.669e-06, + "loss": 0.0009, + "num_tokens": 597440.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 36.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005945276468992233, + "kl": 0.0003785049048019573, + "learning_rate": 2.6686666666666666e-06, + "loss": 0.0, + "num_tokens": 597754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 36.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0691694021224976, + "kl": 0.10933673195540905, + "learning_rate": 2.6683333333333333e-06, + "loss": 0.0126, + "num_tokens": 598159.0, + "reward": 1.625, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 36.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1535937339067459, + "kl": 0.05911637470126152, + "learning_rate": 2.668e-06, + "loss": 0.003, + "num_tokens": 598462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 37.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0974106639623642, + "kl": 0.004137328127399087, + "learning_rate": 2.667666666666667e-06, + "loss": 0.0002, + "num_tokens": 598718.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 37.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.367629528045654, + "kl": 0.09553771838545799, + "learning_rate": 2.6673333333333333e-06, + "loss": -0.0078, + "num_tokens": 599066.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 37.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018481021746993065, + "kl": 0.004939146805554628, + "learning_rate": 2.667e-06, + "loss": 0.0002, + "num_tokens": 599336.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 37.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006614015903323889, + "kl": 0.0011533379438333213, + "learning_rate": 2.6666666666666664e-06, + "loss": 0.0001, + "num_tokens": 599596.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 37.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18544217944145203, + "kl": 0.03620108962059021, + "learning_rate": 2.6663333333333336e-06, + "loss": 0.0018, + "num_tokens": 599901.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 37.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01268834713846445, + "kl": 0.0005764476954936981, + "learning_rate": 2.6660000000000004e-06, + "loss": 0.0, + "num_tokens": 600161.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 37.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1501883268356323, + "kl": 0.02275223797187209, + "learning_rate": 2.6656666666666668e-06, + "loss": -0.0506, + "num_tokens": 600587.0, + "reward": 2.049999952316284, + "reward_std": 1.4177446365356445, + "rewards/reward_combined/mean": 2.049999952316284, + "rewards/reward_combined/std": 1.417744755744934, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 37.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006446678191423416, + "kl": 0.00017789999401429668, + "learning_rate": 2.6653333333333335e-06, + "loss": 0.0, + "num_tokens": 600859.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 37.148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.847894191741943, + "kl": 0.031437797006219625, + "learning_rate": 2.665e-06, + "loss": 0.1088, + "num_tokens": 601134.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 37.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007883016020059586, + "kl": 0.0006027668714523315, + "learning_rate": 2.6646666666666667e-06, + "loss": 0.0, + "num_tokens": 601350.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 37.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03074074164032936, + "kl": 0.0016907327226363122, + "learning_rate": 2.6643333333333335e-06, + "loss": 0.0001, + "num_tokens": 601619.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 37.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07490681856870651, + "kl": 0.01436805771663785, + "learning_rate": 2.6640000000000002e-06, + "loss": 0.0007, + "num_tokens": 601910.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 37.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03064054436981678, + "kl": 0.0018362122355028987, + "learning_rate": 2.6636666666666666e-06, + "loss": 0.0001, + "num_tokens": 602192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 37.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14514805376529694, + "kl": 0.01454864488914609, + "learning_rate": 2.6633333333333334e-06, + "loss": 0.0008, + "num_tokens": 602500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 37.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6526553630828857, + "kl": 0.015236596576869488, + "learning_rate": 2.663e-06, + "loss": 0.0237, + "num_tokens": 602833.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 37.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005322328768670559, + "kl": 0.00038795835280325264, + "learning_rate": 2.6626666666666665e-06, + "loss": 0.0, + "num_tokens": 603145.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 37.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013279959559440613, + "kl": 0.0037963627837598324, + "learning_rate": 2.6623333333333333e-06, + "loss": 0.0002, + "num_tokens": 603405.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 37.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011032840237021446, + "kl": 0.002308901399374008, + "learning_rate": 2.662e-06, + "loss": 0.0001, + "num_tokens": 603717.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 37.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01130884513258934, + "kl": 0.26621611416339874, + "learning_rate": 2.661666666666667e-06, + "loss": 0.0133, + "num_tokens": 604021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 37.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02242274582386017, + "kl": 0.0009990260004997253, + "learning_rate": 2.6613333333333332e-06, + "loss": 0.0, + "num_tokens": 604231.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 37.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055110346525907516, + "kl": 0.0056349122896790504, + "learning_rate": 2.661e-06, + "loss": 0.0003, + "num_tokens": 604522.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 37.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006918244995176792, + "kl": 0.010301381349563599, + "learning_rate": 2.6606666666666664e-06, + "loss": 0.0005, + "num_tokens": 604794.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 37.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010429744143038988, + "kl": 3.089010715484619e-05, + "learning_rate": 2.6603333333333336e-06, + "loss": 0.0, + "num_tokens": 605014.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 37.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01997925341129303, + "kl": 0.0079887006431818, + "learning_rate": 2.6600000000000004e-06, + "loss": 0.0004, + "num_tokens": 605320.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 37.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03852691873908043, + "kl": 0.0002607181668281555, + "learning_rate": 2.6596666666666667e-06, + "loss": 0.0, + "num_tokens": 605532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 37.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09802347421646118, + "kl": 0.006271092686802149, + "learning_rate": 2.6593333333333335e-06, + "loss": 0.0003, + "num_tokens": 605788.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 37.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.044208526611328, + "kl": 0.015496873296797276, + "learning_rate": 2.659e-06, + "loss": 0.3791, + "num_tokens": 606029.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 37.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004191862419247627, + "kl": 0.01603720895946026, + "learning_rate": 2.6586666666666667e-06, + "loss": 0.0008, + "num_tokens": 606289.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 37.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012551024556159973, + "kl": 0.0012371752527542412, + "learning_rate": 2.6583333333333334e-06, + "loss": 0.0001, + "num_tokens": 606569.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 37.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04891206696629524, + "kl": 0.012084125075489283, + "learning_rate": 2.6580000000000002e-06, + "loss": 0.0006, + "num_tokens": 606865.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 37.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.988032341003418, + "kl": 0.040377695113420486, + "learning_rate": 2.6576666666666666e-06, + "loss": 0.2248, + "num_tokens": 607241.0, + "reward": 4.875, + "reward_std": 5.25, + "rewards/reward_combined/mean": 4.875, + "rewards/reward_combined/std": 5.25, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 37.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04607218876481056, + "kl": 0.00990409275982529, + "learning_rate": 2.6573333333333334e-06, + "loss": 0.0006, + "num_tokens": 607540.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 37.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25264519453048706, + "kl": 0.07270743325352669, + "learning_rate": 2.657e-06, + "loss": 0.0036, + "num_tokens": 607860.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 37.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043640125542879105, + "kl": 0.0011769604461733252, + "learning_rate": 2.6566666666666665e-06, + "loss": 0.0001, + "num_tokens": 608094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.00909090880304575, + "clip_ratio/high_mean": 0.00909090880304575, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00909090880304575, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 37.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.958291530609131, + "kl": 0.06803623959422112, + "learning_rate": 2.6563333333333337e-06, + "loss": 0.0536, + "num_tokens": 608452.0, + "reward": 6.625, + "reward_std": 2.0966243743896484, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.0966243743896484, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 37.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23634594678878784, + "kl": 0.015228984877467155, + "learning_rate": 2.656e-06, + "loss": 0.0008, + "num_tokens": 608785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 37.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06777699291706085, + "kl": 0.13369496539235115, + "learning_rate": 2.655666666666667e-06, + "loss": 0.0068, + "num_tokens": 609091.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 37.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30684855580329895, + "kl": 0.06815839000046253, + "learning_rate": 2.655333333333333e-06, + "loss": 0.0034, + "num_tokens": 609409.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 37.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19832812249660492, + "kl": 0.024977766908705235, + "learning_rate": 2.655e-06, + "loss": 0.0013, + "num_tokens": 609755.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 37.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5047805309295654, + "kl": 0.06455581076443195, + "learning_rate": 2.6546666666666668e-06, + "loss": 0.0054, + "num_tokens": 610118.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 37.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.523073673248291, + "kl": 0.6824191145133227, + "learning_rate": 2.6543333333333336e-06, + "loss": 0.0371, + "num_tokens": 610414.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 37.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1935129463672638, + "kl": 0.0730351060628891, + "learning_rate": 2.6540000000000003e-06, + "loss": 0.0035, + "num_tokens": 610696.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 37.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021079430356621742, + "kl": 0.0008019383531063795, + "learning_rate": 2.6536666666666667e-06, + "loss": 0.0, + "num_tokens": 611013.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 37.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07596131414175034, + "kl": 0.0033614374697208405, + "learning_rate": 2.6533333333333335e-06, + "loss": 0.0002, + "num_tokens": 611291.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 37.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9809070825576782, + "kl": 0.059764180332422256, + "learning_rate": 2.653e-06, + "loss": 0.0666, + "num_tokens": 611687.0, + "reward": 4.25, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 37.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01673242449760437, + "kl": 0.0010903941001743078, + "learning_rate": 2.6526666666666666e-06, + "loss": 0.0001, + "num_tokens": 611930.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 37.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.7295002937316895, + "kl": 0.021687767934054136, + "learning_rate": 2.6523333333333334e-06, + "loss": 0.3056, + "num_tokens": 612232.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 37.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02881363406777382, + "kl": 0.013736420311033726, + "learning_rate": 2.652e-06, + "loss": 0.0007, + "num_tokens": 612516.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 37.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035456810146570206, + "kl": 0.0930885374546051, + "learning_rate": 2.6516666666666665e-06, + "loss": 0.0047, + "num_tokens": 612880.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 37.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11515820771455765, + "kl": 0.024955608882009983, + "learning_rate": 2.6513333333333333e-06, + "loss": 0.0012, + "num_tokens": 613205.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 37.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0065927221439778805, + "kl": 0.0003466665802989155, + "learning_rate": 2.651e-06, + "loss": 0.0, + "num_tokens": 613425.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 37.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.033989429473877, + "kl": 0.21544765689759515, + "learning_rate": 2.6506666666666665e-06, + "loss": -0.0519, + "num_tokens": 613744.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 37.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046170637011528015, + "kl": 0.001148730458226055, + "learning_rate": 2.6503333333333337e-06, + "loss": 0.0001, + "num_tokens": 614000.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 37.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.253532737493515, + "kl": 0.013601384125649929, + "learning_rate": 2.65e-06, + "loss": 0.0009, + "num_tokens": 614243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 38.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6520941257476807, + "kl": 0.001956335734575987, + "learning_rate": 2.649666666666667e-06, + "loss": 0.0381, + "num_tokens": 614535.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 38.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.49216076731681824, + "kl": 0.11486417427659035, + "learning_rate": 2.649333333333333e-06, + "loss": 0.0057, + "num_tokens": 614866.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 38.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19710887968540192, + "kl": 0.020232319831848145, + "learning_rate": 2.649e-06, + "loss": 0.001, + "num_tokens": 615194.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 38.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06992805004119873, + "kl": 0.007198390318080783, + "learning_rate": 2.6486666666666667e-06, + "loss": 0.0004, + "num_tokens": 615525.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14179585874080658, + "kl": 0.01871525961905718, + "learning_rate": 2.6483333333333335e-06, + "loss": 0.001, + "num_tokens": 615814.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 38.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.338315963745117, + "kl": 0.04318897798657417, + "learning_rate": 2.6480000000000003e-06, + "loss": -0.0093, + "num_tokens": 616146.0, + "reward": 4.375, + "reward_std": 2.0966243743896484, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 2.0966243743896484, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 38.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005700599867850542, + "kl": 0.00030153393163345754, + "learning_rate": 2.6476666666666667e-06, + "loss": 0.0, + "num_tokens": 616366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.011111111380159855, + "clip_ratio/high_mean": 0.011111111380159855, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011111111380159855, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 38.129629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.507438659667969, + "kl": 0.05070340633392334, + "learning_rate": 2.6473333333333335e-06, + "loss": -0.0043, + "num_tokens": 616674.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05976412445306778, + "kl": 0.004106957232579589, + "learning_rate": 2.647e-06, + "loss": 0.0002, + "num_tokens": 616958.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 38.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.302739143371582, + "kl": 0.055559821776114404, + "learning_rate": 2.6466666666666666e-06, + "loss": 0.0901, + "num_tokens": 617196.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 38.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3871672451496124, + "kl": 0.05765043757855892, + "learning_rate": 2.6463333333333334e-06, + "loss": 0.0032, + "num_tokens": 617460.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01696948893368244, + "kl": 0.001366661163046956, + "learning_rate": 2.646e-06, + "loss": 0.0001, + "num_tokens": 617744.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 38.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002141333185136318, + "kl": 8.910894393920898e-05, + "learning_rate": 2.645666666666667e-06, + "loss": 0.0, + "num_tokens": 617964.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.7232794761657715, + "kl": 0.0037858079303987324, + "learning_rate": 2.6453333333333333e-06, + "loss": 0.1205, + "num_tokens": 618240.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 38.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001858091214671731, + "kl": 0.00010813176777446643, + "learning_rate": 2.645e-06, + "loss": 0.0, + "num_tokens": 618508.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23852196335792542, + "kl": 0.05729203671216965, + "learning_rate": 2.644666666666667e-06, + "loss": 0.0024, + "num_tokens": 618825.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 38.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02844509482383728, + "kl": 0.0005351453874027357, + "learning_rate": 2.6443333333333337e-06, + "loss": 0.0, + "num_tokens": 619081.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 38.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04656428098678589, + "kl": 0.007615548558533192, + "learning_rate": 2.644e-06, + "loss": 0.0004, + "num_tokens": 619383.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09220888465642929, + "kl": 0.006497216279967688, + "learning_rate": 2.643666666666667e-06, + "loss": 0.0003, + "num_tokens": 619662.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.010204081423580647, + "clip_ratio/high_mean": 0.010204081423580647, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010204081423580647, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 38.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1496195793151855, + "kl": 0.12165561318397522, + "learning_rate": 2.643333333333333e-06, + "loss": 0.0887, + "num_tokens": 620004.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 38.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016272624488919973, + "kl": 0.0011427743011154234, + "learning_rate": 2.643e-06, + "loss": 0.0001, + "num_tokens": 620284.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 38.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24287305772304535, + "kl": 0.02667728951200843, + "learning_rate": 2.6426666666666667e-06, + "loss": 0.0013, + "num_tokens": 620538.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 38.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09146798402070999, + "kl": 0.002835690975189209, + "learning_rate": 2.6423333333333335e-06, + "loss": 0.0001, + "num_tokens": 620754.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 38.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.900708198547363, + "kl": 0.08538460358977318, + "learning_rate": 2.6420000000000003e-06, + "loss": -0.276, + "num_tokens": 621040.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 38.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4542900025844574, + "kl": 0.06759600341320038, + "learning_rate": 2.6416666666666666e-06, + "loss": 0.0033, + "num_tokens": 621341.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31494176387786865, + "kl": 0.064749326556921, + "learning_rate": 2.6413333333333334e-06, + "loss": 0.0036, + "num_tokens": 621631.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 38.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009203692898154259, + "kl": 0.009390239603817463, + "learning_rate": 2.6409999999999998e-06, + "loss": 0.0005, + "num_tokens": 621903.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 38.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03841273859143257, + "kl": 0.007130143931135535, + "learning_rate": 2.640666666666667e-06, + "loss": 0.0004, + "num_tokens": 622247.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1198619157075882, + "kl": 0.021041665691882372, + "learning_rate": 2.6403333333333334e-06, + "loss": 0.0011, + "num_tokens": 622515.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 38.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03470019996166229, + "kl": 0.00427778234006837, + "learning_rate": 2.64e-06, + "loss": 0.0002, + "num_tokens": 622827.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 38.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27020177245140076, + "kl": 0.053211357444524765, + "learning_rate": 2.639666666666667e-06, + "loss": 0.0027, + "num_tokens": 623129.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.43861722946167, + "kl": 0.8715322834905237, + "learning_rate": 2.6393333333333333e-06, + "loss": -0.0466, + "num_tokens": 623416.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 38.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008841919712722301, + "kl": 0.0005360543727874756, + "learning_rate": 2.639e-06, + "loss": 0.0, + "num_tokens": 623676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 38.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006221949588507414, + "kl": 0.0009625107049942017, + "learning_rate": 2.638666666666667e-06, + "loss": 0.0, + "num_tokens": 623920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 38.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.516137599945068, + "kl": 0.08168338239192963, + "learning_rate": 2.6383333333333336e-06, + "loss": 0.1445, + "num_tokens": 624205.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 38.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015516391955316067, + "kl": 0.0010561671806499362, + "learning_rate": 2.638e-06, + "loss": 0.0001, + "num_tokens": 624467.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 38.666666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3098702430725098, + "kl": 0.0630001462996006, + "learning_rate": 2.6376666666666668e-06, + "loss": 0.0585, + "num_tokens": 624818.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34222495555877686, + "kl": 0.038062578067183495, + "learning_rate": 2.637333333333333e-06, + "loss": 0.0019, + "num_tokens": 625114.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 38.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004281165078282356, + "kl": 0.015956650488078594, + "learning_rate": 2.637e-06, + "loss": 0.0008, + "num_tokens": 625374.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 38.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02747010812163353, + "kl": 0.0013428330421447754, + "learning_rate": 2.6366666666666667e-06, + "loss": 0.0001, + "num_tokens": 625582.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 38.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03892965614795685, + "kl": 0.0028251956100575626, + "learning_rate": 2.6363333333333335e-06, + "loss": 0.0002, + "num_tokens": 625905.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 38.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01461059134453535, + "kl": 0.26530514657497406, + "learning_rate": 2.6360000000000003e-06, + "loss": 0.0133, + "num_tokens": 626209.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 38.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.523496150970459, + "kl": 0.09386411216109991, + "learning_rate": 2.6356666666666666e-06, + "loss": -0.2074, + "num_tokens": 626562.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 38.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.259440898895264, + "kl": 0.08142408728599548, + "learning_rate": 2.6353333333333334e-06, + "loss": 0.0902, + "num_tokens": 626891.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 38.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00920241791754961, + "kl": 0.03917407616972923, + "learning_rate": 2.6349999999999998e-06, + "loss": 0.002, + "num_tokens": 627296.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 38.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03610505163669586, + "kl": 0.09293317049741745, + "learning_rate": 2.634666666666667e-06, + "loss": 0.0046, + "num_tokens": 627660.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 38.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07827702164649963, + "kl": 0.006866562878713012, + "learning_rate": 2.6343333333333333e-06, + "loss": 0.0003, + "num_tokens": 627964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 38.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010970977135002613, + "kl": 0.002987690269947052, + "learning_rate": 2.634e-06, + "loss": 0.0001, + "num_tokens": 628180.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 38.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3584320843219757, + "kl": 0.19197433441877365, + "learning_rate": 2.633666666666667e-06, + "loss": 0.0096, + "num_tokens": 628490.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.05000000074505806, + "clip_ratio/high_mean": 0.05000000074505806, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.05000000074505806, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 38.907407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.96440315246582, + "kl": 0.21618781238794327, + "learning_rate": 2.6333333333333332e-06, + "loss": 0.0969, + "num_tokens": 628728.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 38.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015474081970751286, + "kl": 0.004213389940559864, + "learning_rate": 2.633e-06, + "loss": 0.0002, + "num_tokens": 628998.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 38.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06896309554576874, + "kl": 0.012822144664824009, + "learning_rate": 2.632666666666667e-06, + "loss": 0.0006, + "num_tokens": 629328.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 38.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12541289627552032, + "kl": 0.002096331096254289, + "learning_rate": 2.6323333333333336e-06, + "loss": 0.0001, + "num_tokens": 629541.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 38.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007181167136877775, + "kl": 0.0016973447054624557, + "learning_rate": 2.632e-06, + "loss": 0.0001, + "num_tokens": 629853.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 39.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034228816628456116, + "kl": 0.009527456015348434, + "learning_rate": 2.6316666666666667e-06, + "loss": 0.0005, + "num_tokens": 630142.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 39.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0050581987015903, + "kl": 0.0013630688190460205, + "learning_rate": 2.631333333333333e-06, + "loss": 0.0001, + "num_tokens": 630454.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 39.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28263208270072937, + "kl": 0.023534612730145454, + "learning_rate": 2.631e-06, + "loss": 0.0012, + "num_tokens": 630782.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.529536008834839, + "kl": 0.04671902675181627, + "learning_rate": 2.630666666666667e-06, + "loss": 0.2356, + "num_tokens": 631111.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 39.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029026197269558907, + "kl": 0.0014323961222544312, + "learning_rate": 2.6303333333333334e-06, + "loss": 0.0001, + "num_tokens": 631346.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.125633239746094, + "kl": 0.06368220970034599, + "learning_rate": 2.6300000000000002e-06, + "loss": 0.1448, + "num_tokens": 631635.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.142145156860352, + "kl": 0.048971325159072876, + "learning_rate": 2.6296666666666666e-06, + "loss": 0.1058, + "num_tokens": 631908.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 39.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.45784685015678406, + "kl": 0.09902294422499835, + "learning_rate": 2.6293333333333334e-06, + "loss": 0.003, + "num_tokens": 632164.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 39.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060580141842365265, + "kl": 0.009040889330208302, + "learning_rate": 2.629e-06, + "loss": 0.0004, + "num_tokens": 632496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 39.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1341930478811264, + "kl": 0.037658074870705605, + "learning_rate": 2.628666666666667e-06, + "loss": 0.0021, + "num_tokens": 632834.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 39.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.046877861022949, + "kl": 0.3204140365123749, + "learning_rate": 2.6283333333333333e-06, + "loss": 0.0042, + "num_tokens": 633137.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 39.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1252264678478241, + "kl": 0.015165239572525024, + "learning_rate": 2.628e-06, + "loss": 0.0008, + "num_tokens": 633353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 39.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028630902990698814, + "kl": 0.0017961161211133003, + "learning_rate": 2.627666666666667e-06, + "loss": 0.0001, + "num_tokens": 633665.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 39.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004429290071129799, + "kl": 0.015935112722218037, + "learning_rate": 2.6273333333333332e-06, + "loss": 0.0008, + "num_tokens": 633925.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06858514994382858, + "kl": 0.03680132422596216, + "learning_rate": 2.627e-06, + "loss": 0.0018, + "num_tokens": 634225.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 39.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05428372323513031, + "kl": 0.005785493645817041, + "learning_rate": 2.6266666666666668e-06, + "loss": 0.0003, + "num_tokens": 634481.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 39.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007947173900902271, + "kl": 0.0007086461409926414, + "learning_rate": 2.6263333333333336e-06, + "loss": 0.0, + "num_tokens": 634799.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01637076400220394, + "kl": 0.0038555373903363943, + "learning_rate": 2.626e-06, + "loss": 0.0002, + "num_tokens": 635067.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 39.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0065318383276462555, + "kl": 0.0008631125092506409, + "learning_rate": 2.6256666666666667e-06, + "loss": 0.0, + "num_tokens": 635311.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 39.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20261795818805695, + "kl": 0.048456584103405476, + "learning_rate": 2.625333333333333e-06, + "loss": 0.0026, + "num_tokens": 635632.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 39.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.527511119842529, + "kl": 0.060799007973400876, + "learning_rate": 2.6250000000000003e-06, + "loss": -0.0385, + "num_tokens": 635910.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 39.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.544265270233154, + "kl": 0.20098943263292313, + "learning_rate": 2.624666666666667e-06, + "loss": -0.0376, + "num_tokens": 636244.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 39.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.801536560058594, + "kl": 0.10604314506053925, + "learning_rate": 2.6243333333333334e-06, + "loss": -0.0168, + "num_tokens": 636555.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05629903823137283, + "kl": 0.006447185412980616, + "learning_rate": 2.624e-06, + "loss": 0.0003, + "num_tokens": 636837.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11607575416564941, + "kl": 0.00840452453121543, + "learning_rate": 2.6236666666666666e-06, + "loss": 0.0004, + "num_tokens": 637133.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019287265837192535, + "kl": 0.010870505589991808, + "learning_rate": 2.6233333333333333e-06, + "loss": 0.0005, + "num_tokens": 637401.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 39.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026004992425441742, + "kl": 0.0011187029886059463, + "learning_rate": 2.623e-06, + "loss": 0.0001, + "num_tokens": 637663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 39.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04478120431303978, + "kl": 0.0016564875841140747, + "learning_rate": 2.622666666666667e-06, + "loss": 0.0001, + "num_tokens": 637923.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 39.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08575080335140228, + "kl": 0.005539353413041681, + "learning_rate": 2.6223333333333333e-06, + "loss": 0.0002, + "num_tokens": 638199.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 39.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4723203182220459, + "kl": 0.07787950336933136, + "learning_rate": 2.622e-06, + "loss": 0.0039, + "num_tokens": 638419.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.368828773498535, + "kl": 0.15016889572143555, + "learning_rate": 2.621666666666667e-06, + "loss": 0.1294, + "num_tokens": 638740.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 39.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004681735765188932, + "kl": 0.04500012286007404, + "learning_rate": 2.621333333333333e-06, + "loss": 0.0023, + "num_tokens": 639144.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 39.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04991491138935089, + "kl": 0.01225105207413435, + "learning_rate": 2.621e-06, + "loss": 0.0006, + "num_tokens": 639432.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 39.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21720410883426666, + "kl": 0.0040453895926475525, + "learning_rate": 2.6206666666666668e-06, + "loss": 0.0002, + "num_tokens": 639642.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 39.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.430679053068161, + "kl": 0.05039902962744236, + "learning_rate": 2.6203333333333335e-06, + "loss": 0.0025, + "num_tokens": 639932.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 39.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2291373908519745, + "kl": 0.14086488634347916, + "learning_rate": 2.62e-06, + "loss": 0.0068, + "num_tokens": 640253.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20720158517360687, + "kl": 0.06418174505233765, + "learning_rate": 2.6196666666666667e-06, + "loss": 0.0032, + "num_tokens": 640578.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 39.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08042503893375397, + "kl": 0.010246471967548132, + "learning_rate": 2.619333333333333e-06, + "loss": 0.0005, + "num_tokens": 640902.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14660649001598358, + "kl": 0.028331005945801735, + "learning_rate": 2.6190000000000003e-06, + "loss": 0.0014, + "num_tokens": 641162.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 39.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006078194361180067, + "kl": 0.01085857953876257, + "learning_rate": 2.618666666666667e-06, + "loss": 0.0005, + "num_tokens": 641434.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 39.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1789475679397583, + "kl": 0.40612654387950897, + "learning_rate": 2.6183333333333334e-06, + "loss": 0.0053, + "num_tokens": 641729.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 39.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3460330367088318, + "kl": 0.03149532899260521, + "learning_rate": 2.618e-06, + "loss": 0.0016, + "num_tokens": 642011.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 39.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27375251054763794, + "kl": 0.045461490750312805, + "learning_rate": 2.6176666666666665e-06, + "loss": 0.0022, + "num_tokens": 642291.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 39.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02751186490058899, + "kl": 0.09434760734438896, + "learning_rate": 2.6173333333333333e-06, + "loss": 0.0047, + "num_tokens": 642655.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.853020668029785, + "kl": 0.04612906463444233, + "learning_rate": 2.617e-06, + "loss": 0.0276, + "num_tokens": 642951.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 39.833333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.4463019371032715, + "kl": 0.05522099696099758, + "learning_rate": 2.616666666666667e-06, + "loss": 0.0967, + "num_tokens": 643292.0, + "reward": 3.75, + "reward_std": 2.723355770111084, + "rewards/reward_combined/mean": 3.75, + "rewards/reward_combined/std": 2.723355770111084, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 39.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033128779381513596, + "kl": 0.00267578661441803, + "learning_rate": 2.6163333333333332e-06, + "loss": 0.0001, + "num_tokens": 643504.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 39.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040096163749694824, + "kl": 0.0011983886361122131, + "learning_rate": 2.616e-06, + "loss": 0.0001, + "num_tokens": 643772.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 39.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12236732244491577, + "kl": 0.02696285117417574, + "learning_rate": 2.615666666666667e-06, + "loss": 0.0013, + "num_tokens": 644072.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 39.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08776862174272537, + "kl": 0.012718722689896822, + "learning_rate": 2.615333333333333e-06, + "loss": 0.0006, + "num_tokens": 644372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 39.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022265596315264702, + "kl": 0.0008883476257324219, + "learning_rate": 2.6150000000000004e-06, + "loss": 0.0, + "num_tokens": 644584.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 39.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3911826610565186, + "kl": 0.07657848484814167, + "learning_rate": 2.6146666666666667e-06, + "loss": 0.0198, + "num_tokens": 644905.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 39.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018378296867012978, + "kl": 0.001259174954611808, + "learning_rate": 2.6143333333333335e-06, + "loss": 0.0001, + "num_tokens": 645122.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 39.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019308192655444145, + "kl": 0.0002418264775769785, + "learning_rate": 2.614e-06, + "loss": 0.0, + "num_tokens": 645378.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010439989157021046, + "kl": 0.0017805024981498718, + "learning_rate": 2.6136666666666667e-06, + "loss": 0.0001, + "num_tokens": 645614.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.678936958312988, + "kl": 0.11380278319120407, + "learning_rate": 2.6133333333333334e-06, + "loss": 0.0047, + "num_tokens": 645880.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 40.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006137826945632696, + "kl": 0.0008134424861054868, + "learning_rate": 2.6130000000000002e-06, + "loss": 0.0, + "num_tokens": 646140.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 40.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10522975027561188, + "kl": 0.05451890267431736, + "learning_rate": 2.612666666666667e-06, + "loss": 0.0027, + "num_tokens": 646494.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 40.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.383107662200928, + "kl": 0.07697451114654541, + "learning_rate": 2.6123333333333334e-06, + "loss": 0.0166, + "num_tokens": 646847.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 40.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012628552503883839, + "kl": 0.001994713209569454, + "learning_rate": 2.612e-06, + "loss": 0.0001, + "num_tokens": 647159.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 40.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11814142763614655, + "kl": 0.041642939671874046, + "learning_rate": 2.6116666666666665e-06, + "loss": 0.0021, + "num_tokens": 647484.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014402723172679543, + "kl": 5.1952898502349854e-05, + "learning_rate": 2.6113333333333333e-06, + "loss": 0.0, + "num_tokens": 647704.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08754493296146393, + "kl": 0.0054903654381632805, + "learning_rate": 2.611e-06, + "loss": 0.0003, + "num_tokens": 648000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 40.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6392521858215332, + "kl": 0.02249107463285327, + "learning_rate": 2.610666666666667e-06, + "loss": 0.0868, + "num_tokens": 648417.0, + "reward": 2.799999952316284, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 40.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6833618879318237, + "kl": 0.08711736090481281, + "learning_rate": 2.6103333333333332e-06, + "loss": 0.0041, + "num_tokens": 648716.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 40.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06694292277097702, + "kl": 0.006061729276552796, + "learning_rate": 2.61e-06, + "loss": 0.0003, + "num_tokens": 649044.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 40.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011420521885156631, + "kl": 0.000582220294745639, + "learning_rate": 2.6096666666666668e-06, + "loss": 0.0, + "num_tokens": 649279.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054911404848098755, + "kl": 0.01224487042054534, + "learning_rate": 2.609333333333333e-06, + "loss": 0.0006, + "num_tokens": 649547.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 40.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10499117523431778, + "kl": 0.011184069328010082, + "learning_rate": 2.6090000000000003e-06, + "loss": 0.0006, + "num_tokens": 649807.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 40.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05834371969103813, + "kl": 0.012211961671710014, + "learning_rate": 2.6086666666666667e-06, + "loss": 0.0006, + "num_tokens": 650139.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3531286418437958, + "kl": 0.020087300217710435, + "learning_rate": 2.6083333333333335e-06, + "loss": 0.0012, + "num_tokens": 650360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 40.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3746749460697174, + "kl": 0.0676138773560524, + "learning_rate": 2.608e-06, + "loss": 0.0037, + "num_tokens": 650671.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 40.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19808438420295715, + "kl": 0.05435522459447384, + "learning_rate": 2.6076666666666666e-06, + "loss": 0.0027, + "num_tokens": 650963.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 40.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08828325569629669, + "kl": 0.015800793655216694, + "learning_rate": 2.6073333333333334e-06, + "loss": 0.0008, + "num_tokens": 651251.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 40.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024463383480906487, + "kl": 0.09496597200632095, + "learning_rate": 2.607e-06, + "loss": 0.0047, + "num_tokens": 651615.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 40.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00997218769043684, + "kl": 0.26684069633483887, + "learning_rate": 2.606666666666667e-06, + "loss": 0.0133, + "num_tokens": 651919.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 40.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1911213994026184, + "kl": 0.011752256192266941, + "learning_rate": 2.6063333333333333e-06, + "loss": 0.0006, + "num_tokens": 652243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 40.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.499086856842041, + "kl": 0.034339187666773796, + "learning_rate": 2.606e-06, + "loss": 0.0062, + "num_tokens": 652563.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 40.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.824836730957031, + "kl": 0.003653585212305188, + "learning_rate": 2.6056666666666665e-06, + "loss": 0.0385, + "num_tokens": 652837.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 40.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06412987411022186, + "kl": 0.007954416330903769, + "learning_rate": 2.6053333333333333e-06, + "loss": 0.0004, + "num_tokens": 653119.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.557842254638672, + "kl": 0.029613006860017776, + "learning_rate": 2.605e-06, + "loss": 0.3348, + "num_tokens": 653352.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 40.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2712440490722656, + "kl": 0.10826662555336952, + "learning_rate": 2.604666666666667e-06, + "loss": 0.0055, + "num_tokens": 653681.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009522831067442894, + "kl": 0.0020661503076553345, + "learning_rate": 2.604333333333333e-06, + "loss": 0.0001, + "num_tokens": 653917.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 40.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09133666008710861, + "kl": 0.09584910795092583, + "learning_rate": 2.604e-06, + "loss": 0.0048, + "num_tokens": 654214.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 40.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006647504400461912, + "kl": 0.010639351326972246, + "learning_rate": 2.6036666666666668e-06, + "loss": 0.0005, + "num_tokens": 654486.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 40.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0802210196852684, + "kl": 0.0038588105235248804, + "learning_rate": 2.6033333333333335e-06, + "loss": 0.0002, + "num_tokens": 654729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 40.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5295495986938477, + "kl": 0.013125494122505188, + "learning_rate": 2.6030000000000003e-06, + "loss": 0.0336, + "num_tokens": 655063.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11449353396892548, + "kl": 0.02009360957890749, + "learning_rate": 2.6026666666666667e-06, + "loss": 0.001, + "num_tokens": 655323.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 40.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.113491058349609, + "kl": 0.15874231606721878, + "learning_rate": 2.6023333333333335e-06, + "loss": 0.2664, + "num_tokens": 655661.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 40.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056525591760873795, + "kl": 0.0020536035299301147, + "learning_rate": 2.602e-06, + "loss": 0.0001, + "num_tokens": 655877.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 40.666666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9637739658355713, + "kl": 0.048197727650403976, + "learning_rate": 2.6016666666666666e-06, + "loss": 0.1346, + "num_tokens": 656245.0, + "reward": 2.875, + "reward_std": 4.346933841705322, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 4.346933841705322, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 40.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013585948618128896, + "kl": 0.0011265341890975833, + "learning_rate": 2.6013333333333334e-06, + "loss": 0.0001, + "num_tokens": 656525.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056193895637989044, + "kl": 0.008010640507563949, + "learning_rate": 2.601e-06, + "loss": 0.0004, + "num_tokens": 656798.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 40.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02086481638252735, + "kl": 0.0006685405969619751, + "learning_rate": 2.600666666666667e-06, + "loss": 0.0, + "num_tokens": 657010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 40.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01918591372668743, + "kl": 0.0019661039113998413, + "learning_rate": 2.6003333333333333e-06, + "loss": 0.0001, + "num_tokens": 657270.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 40.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07656833529472351, + "kl": 0.002072945237159729, + "learning_rate": 2.6e-06, + "loss": 0.0001, + "num_tokens": 657480.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 40.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.556825876235962, + "kl": 0.07838684506714344, + "learning_rate": 2.5996666666666665e-06, + "loss": 0.1594, + "num_tokens": 657817.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 40.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05457224324345589, + "kl": 0.007320788223296404, + "learning_rate": 2.5993333333333337e-06, + "loss": 0.0004, + "num_tokens": 658089.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 40.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10989849269390106, + "kl": 0.007729566190391779, + "learning_rate": 2.599e-06, + "loss": 0.0004, + "num_tokens": 658415.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 40.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11126889288425446, + "kl": 0.020552767906337976, + "learning_rate": 2.598666666666667e-06, + "loss": 0.001, + "num_tokens": 658712.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 40.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00414922134950757, + "kl": 0.016017152927815914, + "learning_rate": 2.5983333333333336e-06, + "loss": 0.0008, + "num_tokens": 658972.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 40.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10008968412876129, + "kl": 0.005681299197021872, + "learning_rate": 2.598e-06, + "loss": 0.0002, + "num_tokens": 659248.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 40.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26465556025505066, + "kl": 0.008169974316842854, + "learning_rate": 2.5976666666666667e-06, + "loss": 0.0004, + "num_tokens": 659504.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 40.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017296859994530678, + "kl": 0.0007104054093360901, + "learning_rate": 2.5973333333333335e-06, + "loss": 0.0, + "num_tokens": 659764.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 40.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03946181386709213, + "kl": 0.0026910784072242677, + "learning_rate": 2.5970000000000003e-06, + "loss": 0.0001, + "num_tokens": 660076.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 40.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013506383635103703, + "kl": 0.0015528385993093252, + "learning_rate": 2.5966666666666667e-06, + "loss": 0.0001, + "num_tokens": 660360.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 40.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052044857293367386, + "kl": 0.008261523442342877, + "learning_rate": 2.5963333333333334e-06, + "loss": 0.0004, + "num_tokens": 660660.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 40.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16538044810295105, + "kl": 0.02766789309680462, + "learning_rate": 2.596e-06, + "loss": 0.0014, + "num_tokens": 660954.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 41.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031174039468169212, + "kl": 0.006419439101591706, + "learning_rate": 2.5956666666666666e-06, + "loss": 0.0003, + "num_tokens": 661255.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009693685919046402, + "kl": 0.002014942467212677, + "learning_rate": 2.5953333333333334e-06, + "loss": 0.0001, + "num_tokens": 661491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 41.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03078620508313179, + "kl": 0.001665353775024414, + "learning_rate": 2.595e-06, + "loss": 0.0001, + "num_tokens": 661703.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 41.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008324520662426949, + "kl": 0.0007139469380490482, + "learning_rate": 2.594666666666667e-06, + "loss": 0.0, + "num_tokens": 661975.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 41.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1106836125254631, + "kl": 0.016338031506165862, + "learning_rate": 2.5943333333333333e-06, + "loss": 0.0009, + "num_tokens": 662299.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 41.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22612765431404114, + "kl": 0.026517196791246533, + "learning_rate": 2.594e-06, + "loss": 0.0014, + "num_tokens": 662640.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06424561142921448, + "kl": 0.0029029519064351916, + "learning_rate": 2.5936666666666664e-06, + "loss": 0.0002, + "num_tokens": 662888.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3038100004196167, + "kl": 0.07607274036854506, + "learning_rate": 2.5933333333333336e-06, + "loss": 0.0033, + "num_tokens": 663185.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 41.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02874799631536007, + "kl": 0.09451911970973015, + "learning_rate": 2.593e-06, + "loss": 0.0047, + "num_tokens": 663549.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 41.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02398524433374405, + "kl": 0.0052295564673841, + "learning_rate": 2.5926666666666668e-06, + "loss": 0.0003, + "num_tokens": 663819.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18163734674453735, + "kl": 0.047916144132614136, + "learning_rate": 2.5923333333333336e-06, + "loss": 0.0024, + "num_tokens": 664088.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 41.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3485828936100006, + "kl": 0.042738866060972214, + "learning_rate": 2.592e-06, + "loss": 0.0022, + "num_tokens": 664384.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 41.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4673593044281006, + "kl": 0.13859406113624573, + "learning_rate": 2.5916666666666667e-06, + "loss": 0.0072, + "num_tokens": 664656.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 41.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06750496476888657, + "kl": 0.007245576241984963, + "learning_rate": 2.5913333333333335e-06, + "loss": 0.0004, + "num_tokens": 664961.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 41.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07238642871379852, + "kl": 0.022575938142836094, + "learning_rate": 2.5910000000000003e-06, + "loss": 0.0011, + "num_tokens": 665289.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 41.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006112195551395416, + "kl": 0.0010635495418682694, + "learning_rate": 2.5906666666666666e-06, + "loss": 0.0001, + "num_tokens": 665549.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 41.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1886634826660156, + "kl": 0.01873736083507538, + "learning_rate": 2.5903333333333334e-06, + "loss": -0.0017, + "num_tokens": 665845.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 41.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028272408992052078, + "kl": 0.0009019679855555296, + "learning_rate": 2.5899999999999998e-06, + "loss": 0.0, + "num_tokens": 666079.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 41.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06380181014537811, + "kl": 0.0018694892642088234, + "learning_rate": 2.5896666666666665e-06, + "loss": 0.0001, + "num_tokens": 666335.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 41.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2883283793926239, + "kl": 0.06977497786283493, + "learning_rate": 2.5893333333333338e-06, + "loss": 0.0033, + "num_tokens": 666649.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 41.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06622041761875153, + "kl": 0.002407582842351985, + "learning_rate": 2.589e-06, + "loss": 0.0001, + "num_tokens": 666921.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 41.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.309058666229248, + "kl": 0.09977884218096733, + "learning_rate": 2.588666666666667e-06, + "loss": 0.0659, + "num_tokens": 667261.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 41.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4508535861968994, + "kl": 0.032848202623426914, + "learning_rate": 2.5883333333333333e-06, + "loss": 0.2215, + "num_tokens": 667715.0, + "reward": 2.174999952316284, + "reward_std": 1.649999976158142, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.649999976158142, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 41.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3594186305999756, + "kl": 0.10897567868232727, + "learning_rate": 2.588e-06, + "loss": 0.0362, + "num_tokens": 668065.0, + "reward": 4.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 2.0, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 41.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013281864114105701, + "kl": 0.0006109159730840474, + "learning_rate": 2.587666666666667e-06, + "loss": 0.0, + "num_tokens": 668382.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 41.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12697184085845947, + "kl": 0.005401700735092163, + "learning_rate": 2.5873333333333336e-06, + "loss": 0.0003, + "num_tokens": 668626.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 41.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.261502742767334, + "kl": 0.06200794130563736, + "learning_rate": 2.587e-06, + "loss": -0.0072, + "num_tokens": 668915.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 41.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018267983570694923, + "kl": 0.16118168830871582, + "learning_rate": 2.5866666666666667e-06, + "loss": 0.0081, + "num_tokens": 669223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 41.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27868735790252686, + "kl": 0.04014579672366381, + "learning_rate": 2.5863333333333335e-06, + "loss": 0.002, + "num_tokens": 669550.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 41.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04165789112448692, + "kl": 0.012051904574036598, + "learning_rate": 2.586e-06, + "loss": 0.0006, + "num_tokens": 669818.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 41.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04134419187903404, + "kl": 0.00279614538885653, + "learning_rate": 2.5856666666666667e-06, + "loss": 0.0001, + "num_tokens": 670124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.004629629664123058, + "clip_ratio/high_mean": 0.004629629664123058, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004629629664123058, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 41.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3592734336853027, + "kl": 0.06017509289085865, + "learning_rate": 2.5853333333333335e-06, + "loss": 0.1433, + "num_tokens": 670488.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 41.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31126633286476135, + "kl": 0.022451738826930523, + "learning_rate": 2.5850000000000002e-06, + "loss": 0.0011, + "num_tokens": 670750.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 41.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16258597373962402, + "kl": 0.057327122427523136, + "learning_rate": 2.5846666666666666e-06, + "loss": 0.0028, + "num_tokens": 671069.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 41.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03757637366652489, + "kl": 0.00690287712495774, + "learning_rate": 2.5843333333333334e-06, + "loss": 0.0003, + "num_tokens": 671358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 41.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004391350317746401, + "kl": 0.015965756960213184, + "learning_rate": 2.5839999999999997e-06, + "loss": 0.0008, + "num_tokens": 671618.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 41.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012767374282702804, + "kl": 0.0011478961096145213, + "learning_rate": 2.5836666666666665e-06, + "loss": 0.0001, + "num_tokens": 671898.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18727311491966248, + "kl": 0.0045321062207221985, + "learning_rate": 2.5833333333333337e-06, + "loss": 0.0002, + "num_tokens": 672110.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048203155398368835, + "kl": 0.009726321091875434, + "learning_rate": 2.583e-06, + "loss": 0.0005, + "num_tokens": 672394.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 41.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02354181371629238, + "kl": 0.001652780920267105, + "learning_rate": 2.582666666666667e-06, + "loss": 0.0001, + "num_tokens": 672706.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 41.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022749602794647217, + "kl": 0.0013177543878555298, + "learning_rate": 2.5823333333333332e-06, + "loss": 0.0001, + "num_tokens": 672912.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11462816596031189, + "kl": 0.011050291825085878, + "learning_rate": 2.582e-06, + "loss": 0.0006, + "num_tokens": 673197.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 41.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2037692070007324, + "kl": 1.0258885622024536, + "learning_rate": 2.581666666666667e-06, + "loss": 0.0678, + "num_tokens": 673502.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 41.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5264981389045715, + "kl": 0.12669705785810947, + "learning_rate": 2.5813333333333336e-06, + "loss": 0.0065, + "num_tokens": 673858.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 41.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08587709069252014, + "kl": 0.009181471075862646, + "learning_rate": 2.581e-06, + "loss": 0.0005, + "num_tokens": 674147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.833333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.749634742736816, + "kl": 0.12228460609912872, + "learning_rate": 2.5806666666666667e-06, + "loss": 0.1727, + "num_tokens": 674433.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 41.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5517964363098145, + "kl": 0.4932323209941387, + "learning_rate": 2.5803333333333335e-06, + "loss": 0.0077, + "num_tokens": 674795.0, + "reward": 3.375, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 41.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12199553847312927, + "kl": 0.01308013778179884, + "learning_rate": 2.58e-06, + "loss": 0.0006, + "num_tokens": 675088.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004929345101118088, + "kl": 0.0030177757143974304, + "learning_rate": 2.5796666666666666e-06, + "loss": 0.0002, + "num_tokens": 675304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.015625, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 41.907407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.304366111755371, + "kl": 0.07195407338440418, + "learning_rate": 2.5793333333333334e-06, + "loss": 0.0025, + "num_tokens": 675602.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 41.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07109731435775757, + "kl": 0.0060296617448329926, + "learning_rate": 2.579e-06, + "loss": 0.0003, + "num_tokens": 675930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 41.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16094832122325897, + "kl": 0.014843345154076815, + "learning_rate": 2.5786666666666666e-06, + "loss": 0.0009, + "num_tokens": 676196.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018008019542321563, + "kl": 7.145851850509644e-05, + "learning_rate": 2.5783333333333334e-06, + "loss": 0.0, + "num_tokens": 676416.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 41.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17287349700927734, + "kl": 0.005106365540996194, + "learning_rate": 2.5779999999999997e-06, + "loss": 0.0002, + "num_tokens": 676634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 42.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007543537649326026, + "kl": 0.0004306808114051819, + "learning_rate": 2.577666666666667e-06, + "loss": 0.0, + "num_tokens": 676894.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6143964529037476, + "kl": 0.1721301469951868, + "learning_rate": 2.5773333333333337e-06, + "loss": 0.0082, + "num_tokens": 677194.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027700692415237427, + "kl": 0.153845876455307, + "learning_rate": 2.577e-06, + "loss": 0.0077, + "num_tokens": 677505.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 42.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01927250064909458, + "kl": 0.0012126043438911438, + "learning_rate": 2.576666666666667e-06, + "loss": 0.0001, + "num_tokens": 677713.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 42.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005752946250140667, + "kl": 6.977468729019165e-05, + "learning_rate": 2.576333333333333e-06, + "loss": 0.0, + "num_tokens": 677925.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07913000136613846, + "kl": 0.008182714227586985, + "learning_rate": 2.576e-06, + "loss": 0.0004, + "num_tokens": 678199.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 42.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6019484996795654, + "kl": 0.11625132523477077, + "learning_rate": 2.5756666666666668e-06, + "loss": 0.1041, + "num_tokens": 678535.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 83.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 83.5, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 42.129629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0931789875030518, + "kl": 0.022031554020941257, + "learning_rate": 2.5753333333333336e-06, + "loss": 0.438, + "num_tokens": 679097.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27651286125183105, + "kl": 0.051644254475831985, + "learning_rate": 2.575e-06, + "loss": 0.0026, + "num_tokens": 679423.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 42.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006301445886492729, + "kl": 0.00047616162919439375, + "learning_rate": 2.5746666666666667e-06, + "loss": 0.0, + "num_tokens": 679695.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 42.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044776782393455505, + "kl": 0.039941176772117615, + "learning_rate": 2.5743333333333335e-06, + "loss": 0.002, + "num_tokens": 680099.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 42.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.366850852966309, + "kl": 0.018985837697982788, + "learning_rate": 2.574e-06, + "loss": 0.1862, + "num_tokens": 680375.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 42.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06638304889202118, + "kl": 0.002032451331615448, + "learning_rate": 2.573666666666667e-06, + "loss": 0.0001, + "num_tokens": 680635.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 42.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003982122987508774, + "kl": 0.0008017778454814106, + "learning_rate": 2.5733333333333334e-06, + "loss": 0.0, + "num_tokens": 680895.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 42.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8614780902862549, + "kl": 0.13447848334908485, + "learning_rate": 2.573e-06, + "loss": 0.0069, + "num_tokens": 681219.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 42.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7912726402282715, + "kl": 0.12185978144407272, + "learning_rate": 2.5726666666666665e-06, + "loss": -0.0597, + "num_tokens": 681527.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 42.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3246501088142395, + "kl": 0.03552582301199436, + "learning_rate": 2.5723333333333333e-06, + "loss": 0.0018, + "num_tokens": 681863.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5744963884353638, + "kl": 0.041536884382367134, + "learning_rate": 2.572e-06, + "loss": 0.0022, + "num_tokens": 682159.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11229992657899857, + "kl": 0.04291626671329141, + "learning_rate": 2.571666666666667e-06, + "loss": 0.0021, + "num_tokens": 682449.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 42.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006222181487828493, + "kl": 0.015481793321669102, + "learning_rate": 2.5713333333333337e-06, + "loss": 0.0008, + "num_tokens": 682709.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030857743695378304, + "kl": 0.0016200148966163397, + "learning_rate": 2.571e-06, + "loss": 0.0001, + "num_tokens": 683005.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 42.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10765678435564041, + "kl": 0.0046082064509391785, + "learning_rate": 2.570666666666667e-06, + "loss": 0.0002, + "num_tokens": 683225.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011586115695536137, + "kl": 0.0014963998110033572, + "learning_rate": 2.570333333333333e-06, + "loss": 0.0001, + "num_tokens": 683509.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09506388008594513, + "kl": 0.009406433149706572, + "learning_rate": 2.57e-06, + "loss": 0.0005, + "num_tokens": 683808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06232907250523567, + "kl": 0.01271789101883769, + "learning_rate": 2.5696666666666667e-06, + "loss": 0.0006, + "num_tokens": 684127.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17157714068889618, + "kl": 0.04104907996952534, + "learning_rate": 2.5693333333333335e-06, + "loss": 0.002, + "num_tokens": 684397.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2651209831237793, + "kl": 0.034470973536372185, + "learning_rate": 2.569e-06, + "loss": 0.0021, + "num_tokens": 684679.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 42.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12067551165819168, + "kl": 0.011610866524279118, + "learning_rate": 2.5686666666666667e-06, + "loss": 0.0007, + "num_tokens": 685012.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 42.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12334544211626053, + "kl": 0.012893387116491795, + "learning_rate": 2.5683333333333334e-06, + "loss": 0.0006, + "num_tokens": 685272.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 42.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06318001449108124, + "kl": 0.008761949837207794, + "learning_rate": 2.568e-06, + "loss": 0.0004, + "num_tokens": 685488.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 42.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042493775486946106, + "kl": 0.008366410853341222, + "learning_rate": 2.567666666666667e-06, + "loss": 0.0004, + "num_tokens": 685778.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 42.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.58232307434082, + "kl": 0.09087443561293185, + "learning_rate": 2.5673333333333334e-06, + "loss": 0.011, + "num_tokens": 686046.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 42.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00357165839523077, + "kl": 0.00012801885895896703, + "learning_rate": 2.567e-06, + "loss": 0.0, + "num_tokens": 686266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 42.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033808790147304535, + "kl": 0.007193173747509718, + "learning_rate": 2.5666666666666665e-06, + "loss": 0.0004, + "num_tokens": 686554.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05844135582447052, + "kl": 0.009986089775338769, + "learning_rate": 2.5663333333333333e-06, + "loss": 0.0005, + "num_tokens": 686826.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 42.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03625547140836716, + "kl": 0.006588557502254844, + "learning_rate": 2.566e-06, + "loss": 0.0003, + "num_tokens": 687169.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 42.666666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.191218852996826, + "kl": 0.07684733346104622, + "learning_rate": 2.565666666666667e-06, + "loss": 0.0379, + "num_tokens": 687499.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 42.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030018970370292664, + "kl": 0.09379787370562553, + "learning_rate": 2.5653333333333336e-06, + "loss": 0.0047, + "num_tokens": 687863.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.014705882407724857, + "clip_ratio/low_min": 0.014705882407724857, + "clip_ratio/region_mean": 0.014705882407724857, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 42.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.721210479736328, + "kl": 0.02078762650489807, + "learning_rate": 2.565e-06, + "loss": 0.4353, + "num_tokens": 688109.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 42.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08929096907377243, + "kl": 0.003990313387475908, + "learning_rate": 2.564666666666667e-06, + "loss": 0.0002, + "num_tokens": 688365.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 42.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015159958973526955, + "kl": 0.0008501690026605502, + "learning_rate": 2.564333333333333e-06, + "loss": 0.0, + "num_tokens": 688684.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 42.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026792660355567932, + "kl": 0.0021284203976392746, + "learning_rate": 2.564e-06, + "loss": 0.0001, + "num_tokens": 688996.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 42.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0151569843292236, + "kl": 0.06188581883907318, + "learning_rate": 2.5636666666666667e-06, + "loss": -0.0628, + "num_tokens": 689341.0, + "reward": 6.5, + "reward_std": 2.345207929611206, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.345207929611206, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03259953111410141, + "kl": 0.009544974192976952, + "learning_rate": 2.5633333333333335e-06, + "loss": 0.0005, + "num_tokens": 689609.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 42.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057312652468681335, + "kl": 0.016362751834094524, + "learning_rate": 2.563e-06, + "loss": 0.0008, + "num_tokens": 689911.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 42.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005066038575023413, + "kl": 0.0006015742546878755, + "learning_rate": 2.5626666666666666e-06, + "loss": 0.0, + "num_tokens": 690154.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 42.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.187950611114502, + "kl": 0.04966283682733774, + "learning_rate": 2.5623333333333334e-06, + "loss": 0.0011, + "num_tokens": 690490.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 42.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00905273575335741, + "kl": 0.002031169831752777, + "learning_rate": 2.562e-06, + "loss": 0.0001, + "num_tokens": 690726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 42.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1744160652160645, + "kl": 0.10203511267900467, + "learning_rate": 2.561666666666667e-06, + "loss": 0.0621, + "num_tokens": 691074.0, + "reward": 4.25, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06537610292434692, + "kl": 0.005710832541808486, + "learning_rate": 2.5613333333333333e-06, + "loss": 0.0003, + "num_tokens": 691351.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 42.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0857694149017334, + "kl": 0.010313042905181646, + "learning_rate": 2.561e-06, + "loss": 0.0005, + "num_tokens": 691633.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 42.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25820374488830566, + "kl": 0.06344599276781082, + "learning_rate": 2.5606666666666665e-06, + "loss": 0.0031, + "num_tokens": 691979.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 42.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061222560703754425, + "kl": 0.0075517280492931604, + "learning_rate": 2.5603333333333333e-06, + "loss": 0.0004, + "num_tokens": 692278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 42.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.756105899810791, + "kl": 0.1902681589126587, + "learning_rate": 2.56e-06, + "loss": 0.0247, + "num_tokens": 692583.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 43.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022758089005947113, + "kl": 0.0007738002750556916, + "learning_rate": 2.559666666666667e-06, + "loss": 0.0, + "num_tokens": 692819.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 43.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.667349100112915, + "kl": 0.08784319180995226, + "learning_rate": 2.5593333333333336e-06, + "loss": 0.0072, + "num_tokens": 693125.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 43.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2427816092967987, + "kl": 0.030141443479806185, + "learning_rate": 2.559e-06, + "loss": 0.0016, + "num_tokens": 693396.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 43.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04597758874297142, + "kl": 0.0011098682880401611, + "learning_rate": 2.5586666666666668e-06, + "loss": 0.0001, + "num_tokens": 693608.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 43.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.068799018859863, + "kl": 0.030716415494680405, + "learning_rate": 2.558333333333333e-06, + "loss": 0.0109, + "num_tokens": 693915.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 43.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02640840783715248, + "kl": 0.0005000904202461243, + "learning_rate": 2.5580000000000003e-06, + "loss": 0.0, + "num_tokens": 694123.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 43.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.983851432800293, + "kl": 0.016102399677038193, + "learning_rate": 2.5576666666666667e-06, + "loss": 0.1011, + "num_tokens": 694465.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 43.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007450214121490717, + "kl": 0.00421206234022975, + "learning_rate": 2.5573333333333335e-06, + "loss": 0.0002, + "num_tokens": 694733.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 43.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13418050110340118, + "kl": 0.003793664276599884, + "learning_rate": 2.5570000000000003e-06, + "loss": 0.0002, + "num_tokens": 694977.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 43.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.633325576782227, + "kl": 0.08090226771309972, + "learning_rate": 2.5566666666666666e-06, + "loss": 0.002, + "num_tokens": 695271.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.021739130839705467, + "clip_ratio/high_mean": 0.021739130839705467, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021739130839705467, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 43.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.948868751525879, + "kl": 0.02674010396003723, + "learning_rate": 2.5563333333333334e-06, + "loss": 0.1643, + "num_tokens": 695555.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 43.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02924892120063305, + "kl": 0.04285065270960331, + "learning_rate": 2.556e-06, + "loss": 0.0021, + "num_tokens": 695959.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.4958176612854, + "kl": 0.26358838722808287, + "learning_rate": 2.555666666666667e-06, + "loss": 0.0683, + "num_tokens": 696246.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 43.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10956063121557236, + "kl": 0.013329327572137117, + "learning_rate": 2.5553333333333333e-06, + "loss": 0.0007, + "num_tokens": 696506.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 43.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6764758229255676, + "kl": 0.07917577400803566, + "learning_rate": 2.555e-06, + "loss": 0.0042, + "num_tokens": 696872.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.057281970977783, + "kl": 0.028899877332150936, + "learning_rate": 2.5546666666666665e-06, + "loss": 0.0031, + "num_tokens": 697144.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 43.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02428019791841507, + "kl": 0.09250488132238388, + "learning_rate": 2.5543333333333332e-06, + "loss": 0.0046, + "num_tokens": 697510.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 43.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0939767062664032, + "kl": 0.02160784974694252, + "learning_rate": 2.554e-06, + "loss": 0.0011, + "num_tokens": 697826.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 43.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06708146631717682, + "kl": 0.005683839903213084, + "learning_rate": 2.553666666666667e-06, + "loss": 0.0003, + "num_tokens": 698103.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 43.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01883876323699951, + "kl": 0.00022596716735279188, + "learning_rate": 2.5533333333333336e-06, + "loss": 0.0, + "num_tokens": 698359.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 43.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05525289848446846, + "kl": 0.003440507600316778, + "learning_rate": 2.553e-06, + "loss": 0.0002, + "num_tokens": 698623.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 43.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023331388365477324, + "kl": 0.0014758408069610596, + "learning_rate": 2.5526666666666667e-06, + "loss": 0.0001, + "num_tokens": 698935.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09795980155467987, + "kl": 0.02744582900777459, + "learning_rate": 2.552333333333333e-06, + "loss": 0.0013, + "num_tokens": 699210.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 43.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006187402177602053, + "kl": 0.01566222310066223, + "learning_rate": 2.5520000000000003e-06, + "loss": 0.0008, + "num_tokens": 699470.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 43.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.402616024017334, + "kl": 0.03923051059246063, + "learning_rate": 2.5516666666666667e-06, + "loss": 0.0159, + "num_tokens": 699850.0, + "reward": 3.875, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 4.190763473510742, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 43.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.186767578125, + "kl": 0.08794543892145157, + "learning_rate": 2.5513333333333334e-06, + "loss": 0.2411, + "num_tokens": 700073.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 43.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2174459546804428, + "kl": 0.02345143910497427, + "learning_rate": 2.5510000000000002e-06, + "loss": 0.0012, + "num_tokens": 700365.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 43.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06970176845788956, + "kl": 0.0035487039713189006, + "learning_rate": 2.5506666666666666e-06, + "loss": 0.0002, + "num_tokens": 700632.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 43.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002296097343787551, + "kl": 0.0004628002643585205, + "learning_rate": 2.5503333333333334e-06, + "loss": 0.0, + "num_tokens": 700892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 43.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1953400373458862, + "kl": 0.09098661225289106, + "learning_rate": 2.55e-06, + "loss": 0.0038, + "num_tokens": 701166.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 43.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3264387845993042, + "kl": 0.030001981183886528, + "learning_rate": 2.549666666666667e-06, + "loss": 0.001, + "num_tokens": 701420.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 43.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20210789144039154, + "kl": 0.053033437579870224, + "learning_rate": 2.5493333333333333e-06, + "loss": 0.0026, + "num_tokens": 701774.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 43.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4588561058044434, + "kl": 0.08353149518370628, + "learning_rate": 2.549e-06, + "loss": -0.0189, + "num_tokens": 702108.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 43.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009912577457726002, + "kl": 0.00048766733380034566, + "learning_rate": 2.5486666666666664e-06, + "loss": 0.0, + "num_tokens": 702428.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 43.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09448502212762833, + "kl": 0.007809346076101065, + "learning_rate": 2.5483333333333332e-06, + "loss": 0.0004, + "num_tokens": 702662.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 43.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01845453679561615, + "kl": 0.2656458467245102, + "learning_rate": 2.5480000000000004e-06, + "loss": 0.0133, + "num_tokens": 702966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 43.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.126014447247144e-06, + "kl": 2.4139881134033203e-06, + "learning_rate": 2.5476666666666668e-06, + "loss": 0.0, + "num_tokens": 703186.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015084311366081238, + "kl": 0.0018152159755118191, + "learning_rate": 2.5473333333333336e-06, + "loss": 0.0001, + "num_tokens": 703470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 43.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010754057206213474, + "kl": 0.0018049031496047974, + "learning_rate": 2.547e-06, + "loss": 0.0001, + "num_tokens": 703706.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 43.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30488014221191406, + "kl": 0.022927945014089346, + "learning_rate": 2.5466666666666667e-06, + "loss": 0.0011, + "num_tokens": 704040.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1744215041399002, + "kl": 0.023055229801684618, + "learning_rate": 2.5463333333333335e-06, + "loss": 0.001, + "num_tokens": 704338.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 43.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.003176689147949, + "kl": 0.08643694035708904, + "learning_rate": 2.5460000000000003e-06, + "loss": 0.0184, + "num_tokens": 704627.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12822288274765015, + "kl": 0.009080796968191862, + "learning_rate": 2.5456666666666666e-06, + "loss": 0.0005, + "num_tokens": 704925.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 43.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10814723372459412, + "kl": 0.022527985274791718, + "learning_rate": 2.5453333333333334e-06, + "loss": 0.0011, + "num_tokens": 705261.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 43.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10927625000476837, + "kl": 0.002664215862751007, + "learning_rate": 2.545e-06, + "loss": 0.0001, + "num_tokens": 705474.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 43.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1003035306930542, + "kl": 0.007294411770999432, + "learning_rate": 2.5446666666666666e-06, + "loss": 0.0004, + "num_tokens": 705762.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 43.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265911102294922, + "kl": 0.05548745393753052, + "learning_rate": 2.5443333333333333e-06, + "loss": 0.0515, + "num_tokens": 706100.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 43.870370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.68215274810791, + "kl": 0.057888234965503216, + "learning_rate": 2.544e-06, + "loss": 0.0431, + "num_tokens": 706389.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 43.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4367368519306183, + "kl": 0.051830656826496124, + "learning_rate": 2.543666666666667e-06, + "loss": 0.0026, + "num_tokens": 706661.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 43.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033806316554546356, + "kl": 0.005122621078044176, + "learning_rate": 2.5433333333333333e-06, + "loss": 0.0003, + "num_tokens": 706997.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 43.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005678814835846424, + "kl": 0.0001820743127609603, + "learning_rate": 2.543e-06, + "loss": 0.0, + "num_tokens": 707217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 43.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03096291422843933, + "kl": 0.15496885776519775, + "learning_rate": 2.5426666666666664e-06, + "loss": 0.0077, + "num_tokens": 707527.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 43.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006253744009882212, + "kl": 0.0012376871309243143, + "learning_rate": 2.542333333333333e-06, + "loss": 0.0001, + "num_tokens": 707839.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 43.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.815692901611328, + "kl": 0.04098579101264477, + "learning_rate": 2.5420000000000004e-06, + "loss": -0.0823, + "num_tokens": 708142.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24482864141464233, + "kl": 0.018099462613463402, + "learning_rate": 2.5416666666666668e-06, + "loss": 0.0009, + "num_tokens": 708438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 44.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027715403586626053, + "kl": 0.09474433213472366, + "learning_rate": 2.5413333333333335e-06, + "loss": 0.0047, + "num_tokens": 708802.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 44.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7770018577575684, + "kl": 0.16026543080806732, + "learning_rate": 2.541e-06, + "loss": 0.05, + "num_tokens": 709143.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010741172358393669, + "kl": 0.001871950924396515, + "learning_rate": 2.5406666666666667e-06, + "loss": 0.0001, + "num_tokens": 709379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 44.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.987513780593872, + "kl": 0.0055997485760599375, + "learning_rate": 2.5403333333333335e-06, + "loss": 0.1528, + "num_tokens": 709732.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 44.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13215208053588867, + "kl": 0.005676170578226447, + "learning_rate": 2.5400000000000002e-06, + "loss": 0.0003, + "num_tokens": 709965.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 44.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1867983490228653, + "kl": 0.03015311900526285, + "learning_rate": 2.5396666666666666e-06, + "loss": 0.0015, + "num_tokens": 710269.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 44.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26857686042785645, + "kl": 0.02790098451077938, + "learning_rate": 2.5393333333333334e-06, + "loss": 0.0014, + "num_tokens": 710541.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 44.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031219320371747017, + "kl": 0.007754998980090022, + "learning_rate": 2.539e-06, + "loss": 0.0004, + "num_tokens": 710832.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.330705642700195, + "kl": 0.05509260483086109, + "learning_rate": 2.5386666666666665e-06, + "loss": 0.0654, + "num_tokens": 711154.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027127431705594063, + "kl": 0.00034999846684513614, + "learning_rate": 2.5383333333333333e-06, + "loss": 0.0, + "num_tokens": 711410.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 44.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7651302814483643, + "kl": 0.030204717069864273, + "learning_rate": 2.538e-06, + "loss": -0.0754, + "num_tokens": 711762.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 44.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02672317810356617, + "kl": 0.006926621310412884, + "learning_rate": 2.537666666666667e-06, + "loss": 0.0003, + "num_tokens": 712066.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018026020377874374, + "kl": 0.00473035522736609, + "learning_rate": 2.5373333333333332e-06, + "loss": 0.0002, + "num_tokens": 712336.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20849008858203888, + "kl": 0.0290701761841774, + "learning_rate": 2.537e-06, + "loss": 0.0014, + "num_tokens": 712606.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6695441007614136, + "kl": 0.04420142062008381, + "learning_rate": 2.5366666666666664e-06, + "loss": 0.0024, + "num_tokens": 712820.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 44.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3384666442871094, + "kl": 0.07243164023384452, + "learning_rate": 2.5363333333333336e-06, + "loss": 0.1104, + "num_tokens": 713193.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 44.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.241629958152771, + "kl": 0.06414328143000603, + "learning_rate": 2.5360000000000004e-06, + "loss": 0.0032, + "num_tokens": 713519.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 44.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9352216720581055, + "kl": 0.03370051831007004, + "learning_rate": 2.5356666666666667e-06, + "loss": 0.0095, + "num_tokens": 713811.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 44.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034529946744441986, + "kl": 0.1577390357851982, + "learning_rate": 2.5353333333333335e-06, + "loss": 0.0079, + "num_tokens": 714119.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 44.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007353818975389004, + "kl": 0.010421198792755604, + "learning_rate": 2.535e-06, + "loss": 0.0005, + "num_tokens": 714391.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.908125400543213, + "kl": 0.05387901654466987, + "learning_rate": 2.5346666666666667e-06, + "loss": -0.0429, + "num_tokens": 714683.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09389282017946243, + "kl": 0.0027054548263549805, + "learning_rate": 2.5343333333333334e-06, + "loss": 0.0001, + "num_tokens": 714903.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32091230154037476, + "kl": 0.04450591653585434, + "learning_rate": 2.5340000000000002e-06, + "loss": 0.0022, + "num_tokens": 715172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 44.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03132433071732521, + "kl": 0.003305246355012059, + "learning_rate": 2.5336666666666666e-06, + "loss": 0.0002, + "num_tokens": 715502.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 44.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.561185359954834, + "kl": 0.04197950102388859, + "learning_rate": 2.5333333333333334e-06, + "loss": -0.0271, + "num_tokens": 715804.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 44.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00231740134768188, + "kl": 0.0011081545962952077, + "learning_rate": 2.533e-06, + "loss": 0.0001, + "num_tokens": 716084.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 44.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033509547356516123, + "kl": 0.016214151866734028, + "learning_rate": 2.5326666666666665e-06, + "loss": 0.0008, + "num_tokens": 716344.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 44.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031681910157203674, + "kl": 0.0019194036722183228, + "learning_rate": 2.5323333333333337e-06, + "loss": 0.0001, + "num_tokens": 716556.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 44.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03134976699948311, + "kl": 0.002907024696469307, + "learning_rate": 2.532e-06, + "loss": 0.0001, + "num_tokens": 716868.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 44.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010179928503930569, + "kl": 0.00043118372559547424, + "learning_rate": 2.531666666666667e-06, + "loss": 0.0, + "num_tokens": 717112.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 44.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028442008420825005, + "kl": 0.0012857671245001256, + "learning_rate": 2.531333333333333e-06, + "loss": 0.0001, + "num_tokens": 717435.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 44.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056585948914289474, + "kl": 0.01396835083141923, + "learning_rate": 2.531e-06, + "loss": 0.0007, + "num_tokens": 717760.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 44.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019285930320620537, + "kl": 0.2654702961444855, + "learning_rate": 2.5306666666666668e-06, + "loss": 0.0133, + "num_tokens": 718064.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 44.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0360283851623535, + "kl": 0.1022005844861269, + "learning_rate": 2.5303333333333336e-06, + "loss": -0.0013, + "num_tokens": 718467.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 44.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.005399227142334, + "kl": 0.019845019094645977, + "learning_rate": 2.5300000000000003e-06, + "loss": 0.1911, + "num_tokens": 718758.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031161511316895485, + "kl": 0.007949382066726685, + "learning_rate": 2.5296666666666667e-06, + "loss": 0.0004, + "num_tokens": 719026.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.131880760192871, + "kl": 0.0300514732953161, + "learning_rate": 2.5293333333333335e-06, + "loss": 0.0513, + "num_tokens": 719324.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 44.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11699317395687103, + "kl": 0.017543671652674675, + "learning_rate": 2.529e-06, + "loss": 0.0009, + "num_tokens": 719598.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 44.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0197089072316885, + "kl": 0.00172106281388551, + "learning_rate": 2.5286666666666666e-06, + "loss": 0.0001, + "num_tokens": 719866.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 44.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005962066818028688, + "kl": 0.0010440730256959796, + "learning_rate": 2.5283333333333334e-06, + "loss": 0.0001, + "num_tokens": 720178.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009605719707906246, + "kl": 0.003335796296596527, + "learning_rate": 2.528e-06, + "loss": 0.0002, + "num_tokens": 720394.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 44.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013398813083767891, + "kl": 0.016179578378796577, + "learning_rate": 2.5276666666666665e-06, + "loss": 0.0008, + "num_tokens": 720678.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 44.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3021674156188965, + "kl": 0.11446773260831833, + "learning_rate": 2.5273333333333333e-06, + "loss": -0.1335, + "num_tokens": 721012.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.252238273620605, + "kl": 0.021312411059625447, + "learning_rate": 2.527e-06, + "loss": 0.1645, + "num_tokens": 721282.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07279378920793533, + "kl": 0.0004621744155883789, + "learning_rate": 2.5266666666666665e-06, + "loss": 0.0, + "num_tokens": 721494.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 44.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008217846974730492, + "kl": 0.0005680881440639496, + "learning_rate": 2.5263333333333337e-06, + "loss": 0.0, + "num_tokens": 721754.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 44.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2470250427722931, + "kl": 0.03967873938381672, + "learning_rate": 2.526e-06, + "loss": 0.002, + "num_tokens": 722092.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 44.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0047119478695094585, + "kl": 0.0016926114330999553, + "learning_rate": 2.525666666666667e-06, + "loss": 0.0001, + "num_tokens": 722376.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 44.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00697978725656867, + "kl": 0.0017822146764956415, + "learning_rate": 2.525333333333333e-06, + "loss": 0.0001, + "num_tokens": 722636.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 44.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10732169449329376, + "kl": 0.0029513761401176453, + "learning_rate": 2.525e-06, + "loss": 0.0001, + "num_tokens": 722855.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 44.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.628994941711426, + "kl": 0.37450834130868316, + "learning_rate": 2.5246666666666667e-06, + "loss": 0.0439, + "num_tokens": 723158.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 44.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.197968006134033, + "kl": 0.04597779922187328, + "learning_rate": 2.5243333333333335e-06, + "loss": 0.0038, + "num_tokens": 723491.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 73.0, + "completions/mean_terminated_length": 73.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 44.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2310808897018433, + "kl": 0.019989359192550182, + "learning_rate": 2.5240000000000003e-06, + "loss": 0.4263, + "num_tokens": 724003.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15655246376991272, + "kl": 0.02584015391767025, + "learning_rate": 2.5236666666666667e-06, + "loss": 0.0013, + "num_tokens": 724289.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 45.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0297035351395607, + "kl": 0.007721581496298313, + "learning_rate": 2.5233333333333335e-06, + "loss": 0.0004, + "num_tokens": 724557.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 45.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03807724267244339, + "kl": 0.00707882852293551, + "learning_rate": 2.523e-06, + "loss": 0.0004, + "num_tokens": 724901.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 45.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009172759018838406, + "kl": 0.003193281590938568, + "learning_rate": 2.5226666666666666e-06, + "loss": 0.0002, + "num_tokens": 725117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 45.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006181388162076473, + "kl": 0.0004881687054876238, + "learning_rate": 2.5223333333333334e-06, + "loss": 0.0, + "num_tokens": 725435.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18731491267681122, + "kl": 0.029865404590964317, + "learning_rate": 2.522e-06, + "loss": 0.0013, + "num_tokens": 725731.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 45.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02235090732574463, + "kl": 0.0006565302610397339, + "learning_rate": 2.5216666666666665e-06, + "loss": 0.0, + "num_tokens": 725943.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 45.129629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.440337657928467, + "kl": 0.07701127231121063, + "learning_rate": 2.5213333333333333e-06, + "loss": -0.0576, + "num_tokens": 726277.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 45.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10574109107255936, + "kl": 0.02136577805504203, + "learning_rate": 2.521e-06, + "loss": 0.0011, + "num_tokens": 726551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 45.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010525328107178211, + "kl": 0.001914285123348236, + "learning_rate": 2.520666666666667e-06, + "loss": 0.0001, + "num_tokens": 726787.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 45.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23817087709903717, + "kl": 0.04681949131190777, + "learning_rate": 2.5203333333333337e-06, + "loss": 0.0023, + "num_tokens": 727090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 45.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09021669626235962, + "kl": 0.011988789541646838, + "learning_rate": 2.52e-06, + "loss": 0.0006, + "num_tokens": 727392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 45.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.986919403076172, + "kl": 0.2177446000277996, + "learning_rate": 2.519666666666667e-06, + "loss": 0.0397, + "num_tokens": 727653.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 45.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1099245473742485, + "kl": 0.0067427074536681175, + "learning_rate": 2.519333333333333e-06, + "loss": 0.0003, + "num_tokens": 727949.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 45.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005288613494485617, + "kl": 0.00045069254701957107, + "learning_rate": 2.519e-06, + "loss": 0.0, + "num_tokens": 728192.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 45.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16958598792552948, + "kl": 0.025498234666883945, + "learning_rate": 2.5186666666666667e-06, + "loss": 0.0011, + "num_tokens": 728457.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 45.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012915810570120811, + "kl": 0.005473255878314376, + "learning_rate": 2.5183333333333335e-06, + "loss": 0.0003, + "num_tokens": 728725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 45.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.935128927230835, + "kl": 0.06246868520975113, + "learning_rate": 2.5180000000000003e-06, + "loss": 0.0636, + "num_tokens": 729064.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 45.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.11857795715332, + "kl": 0.08278805017471313, + "learning_rate": 2.5176666666666666e-06, + "loss": 0.0594, + "num_tokens": 729402.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 45.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07405371963977814, + "kl": 0.0010067522525787354, + "learning_rate": 2.5173333333333334e-06, + "loss": 0.0001, + "num_tokens": 729614.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 45.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034407759085297585, + "kl": 0.016221345402300358, + "learning_rate": 2.5169999999999998e-06, + "loss": 0.0008, + "num_tokens": 729874.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 45.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014748414047062397, + "kl": 0.0010713711380958557, + "learning_rate": 2.516666666666667e-06, + "loss": 0.0001, + "num_tokens": 730198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 45.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9868483543396, + "kl": 0.040062980726361275, + "learning_rate": 2.5163333333333334e-06, + "loss": -0.0005, + "num_tokens": 730490.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 45.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28041645884513855, + "kl": 0.031283190473914146, + "learning_rate": 2.516e-06, + "loss": 0.0016, + "num_tokens": 730793.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 45.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12332356721162796, + "kl": 0.1614207997918129, + "learning_rate": 2.515666666666667e-06, + "loss": 0.0081, + "num_tokens": 731102.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 45.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.182009220123291, + "kl": 0.07591994479298592, + "learning_rate": 2.5153333333333333e-06, + "loss": 0.0185, + "num_tokens": 731437.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010616044513881207, + "kl": 0.0021869930205866694, + "learning_rate": 2.515e-06, + "loss": 0.0001, + "num_tokens": 731721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 45.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0117707634344697, + "kl": 0.0005625975900329649, + "learning_rate": 2.514666666666667e-06, + "loss": 0.0, + "num_tokens": 731956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 45.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1691562533378601, + "kl": 0.003987833857536316, + "learning_rate": 2.5143333333333336e-06, + "loss": 0.0002, + "num_tokens": 732168.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 45.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0167376808822155, + "kl": 0.2660384327173233, + "learning_rate": 2.514e-06, + "loss": 0.0133, + "num_tokens": 732472.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 45.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33966556191444397, + "kl": 0.027728529879823327, + "learning_rate": 2.5136666666666668e-06, + "loss": 0.0014, + "num_tokens": 732737.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 45.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007064864505082369, + "kl": 0.0018771738396026194, + "learning_rate": 2.513333333333333e-06, + "loss": 0.0001, + "num_tokens": 732997.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 45.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0757845640182495, + "kl": 0.04385972023010254, + "learning_rate": 2.513e-06, + "loss": 0.0002, + "num_tokens": 733309.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13577602803707123, + "kl": 0.014645958319306374, + "learning_rate": 2.5126666666666667e-06, + "loss": 0.0007, + "num_tokens": 733587.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 45.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038021355867385864, + "kl": 0.0007932271109893918, + "learning_rate": 2.5123333333333335e-06, + "loss": 0.0, + "num_tokens": 733867.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 45.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0465419702231884, + "kl": 0.005470138741657138, + "learning_rate": 2.5120000000000003e-06, + "loss": 0.0003, + "num_tokens": 734157.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 45.666666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7946746349334717, + "kl": 0.07041575387120247, + "learning_rate": 2.5116666666666666e-06, + "loss": 0.1658, + "num_tokens": 734512.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.479394793510437, + "kl": 0.17642395664006472, + "learning_rate": 2.5113333333333334e-06, + "loss": 0.0094, + "num_tokens": 734798.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 45.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.376974582672119, + "kl": 0.007895383670984302, + "learning_rate": 2.5109999999999998e-06, + "loss": 0.0378, + "num_tokens": 735072.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 45.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.2208967208862305, + "kl": 0.1544065736234188, + "learning_rate": 2.510666666666667e-06, + "loss": 0.0481, + "num_tokens": 735413.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 45.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045347291976213455, + "kl": 0.01228410005569458, + "learning_rate": 2.5103333333333333e-06, + "loss": 0.0006, + "num_tokens": 735825.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142083078622818, + "kl": 0.025195241440087557, + "learning_rate": 2.51e-06, + "loss": 0.0015, + "num_tokens": 736111.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 45.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03137398883700371, + "kl": 0.09449802339076996, + "learning_rate": 2.509666666666667e-06, + "loss": 0.0047, + "num_tokens": 736475.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 45.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033673427533358335, + "kl": 9.293853509007022e-05, + "learning_rate": 2.5093333333333333e-06, + "loss": 0.0, + "num_tokens": 736731.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 45.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03953562676906586, + "kl": 0.00045480579137802124, + "learning_rate": 2.509e-06, + "loss": 0.0, + "num_tokens": 736951.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 45.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09735623747110367, + "kl": 0.004153555637458339, + "learning_rate": 2.508666666666667e-06, + "loss": 0.0002, + "num_tokens": 737173.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 45.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15920746326446533, + "kl": 0.031207595951855183, + "learning_rate": 2.5083333333333336e-06, + "loss": 0.0016, + "num_tokens": 737442.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 45.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5551946759223938, + "kl": 0.03656729869544506, + "learning_rate": 2.508e-06, + "loss": 0.0018, + "num_tokens": 737738.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 45.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03540947660803795, + "kl": 0.0070354241179302335, + "learning_rate": 2.5076666666666667e-06, + "loss": 0.0004, + "num_tokens": 738061.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 45.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33373937010765076, + "kl": 0.018233067821711302, + "learning_rate": 2.507333333333333e-06, + "loss": 0.0008, + "num_tokens": 738325.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 48.0, + "completions/mean_terminated_length": 48.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 45.925925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7227039337158203, + "kl": 0.018040990456938744, + "learning_rate": 2.507e-06, + "loss": 0.2375, + "num_tokens": 738741.0, + "reward": 7.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.375, + "rewards/reward_combined/std": 0.25, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 45.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11966013163328171, + "kl": 0.021700285375118256, + "learning_rate": 2.506666666666667e-06, + "loss": 0.0011, + "num_tokens": 739065.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 45.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2690456807613373, + "kl": 0.02911150682484731, + "learning_rate": 2.5063333333333334e-06, + "loss": 0.0015, + "num_tokens": 739377.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 45.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006945215165615082, + "kl": 0.0007221311389002949, + "learning_rate": 2.5060000000000002e-06, + "loss": 0.0, + "num_tokens": 739637.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 46.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390542708337307, + "kl": 0.039667438715696335, + "learning_rate": 2.5056666666666666e-06, + "loss": 0.0019, + "num_tokens": 740049.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 46.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011404343240428716, + "kl": 3.598630428314209e-06, + "learning_rate": 2.5053333333333334e-06, + "loss": 0.0, + "num_tokens": 740269.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 46.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009822768159210682, + "kl": 0.004389554378576577, + "learning_rate": 2.505e-06, + "loss": 0.0002, + "num_tokens": 740539.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 46.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015896078199148178, + "kl": 0.2662335932254791, + "learning_rate": 2.504666666666667e-06, + "loss": 0.0133, + "num_tokens": 740843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05427286773920059, + "kl": 0.006968255620449781, + "learning_rate": 2.5043333333333333e-06, + "loss": 0.0003, + "num_tokens": 741146.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 46.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003982073627412319, + "kl": 0.0006556894222740084, + "learning_rate": 2.504e-06, + "loss": 0.0, + "num_tokens": 741426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 46.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0342152938246727, + "kl": 0.00826589995995164, + "learning_rate": 2.503666666666667e-06, + "loss": 0.0004, + "num_tokens": 741714.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 46.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08570891618728638, + "kl": 0.0028475001454353333, + "learning_rate": 2.5033333333333332e-06, + "loss": 0.0001, + "num_tokens": 741922.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06338085234165192, + "kl": 0.017430400475859642, + "learning_rate": 2.503e-06, + "loss": 0.0008, + "num_tokens": 742243.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 46.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035878820344805717, + "kl": 0.016196363605558872, + "learning_rate": 2.502666666666667e-06, + "loss": 0.0008, + "num_tokens": 742503.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.096258640289307, + "kl": 0.12530342489480972, + "learning_rate": 2.5023333333333336e-06, + "loss": 0.1053, + "num_tokens": 742829.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 46.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.535659074783325, + "kl": 0.02935294434428215, + "learning_rate": 2.502e-06, + "loss": 0.119, + "num_tokens": 743182.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 46.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26846736669540405, + "kl": 0.03729828912764788, + "learning_rate": 2.5016666666666667e-06, + "loss": 0.0019, + "num_tokens": 743456.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 46.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3158177435398102, + "kl": 0.026865395164350048, + "learning_rate": 2.501333333333333e-06, + "loss": 0.0014, + "num_tokens": 743717.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 46.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.376829147338867, + "kl": 0.0505690579302609, + "learning_rate": 2.501e-06, + "loss": -0.0717, + "num_tokens": 744063.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 46.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32744091749191284, + "kl": 0.018593482207506895, + "learning_rate": 2.500666666666667e-06, + "loss": 0.0009, + "num_tokens": 744305.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 46.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012301038019359112, + "kl": 0.0015533939003944397, + "learning_rate": 2.5003333333333334e-06, + "loss": 0.0001, + "num_tokens": 744541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 46.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16265687346458435, + "kl": 0.023410575464367867, + "learning_rate": 2.5e-06, + "loss": 0.0012, + "num_tokens": 744813.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 46.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1362433135509491, + "kl": 0.018554782029241323, + "learning_rate": 2.4996666666666666e-06, + "loss": 0.001, + "num_tokens": 745086.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 46.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.151635646820068, + "kl": 0.05165325850248337, + "learning_rate": 2.4993333333333333e-06, + "loss": -0.0348, + "num_tokens": 745385.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7361879348754883, + "kl": 0.25426632445305586, + "learning_rate": 2.499e-06, + "loss": 0.015, + "num_tokens": 745697.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 46.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035286128520965576, + "kl": 0.001969415054190904, + "learning_rate": 2.498666666666667e-06, + "loss": 0.0001, + "num_tokens": 745963.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 46.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12127512693405151, + "kl": 0.007447681622579694, + "learning_rate": 2.4983333333333333e-06, + "loss": 0.0004, + "num_tokens": 746259.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 46.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.530540466308594, + "kl": 0.15745895355939865, + "learning_rate": 2.498e-06, + "loss": -0.0343, + "num_tokens": 746631.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 43.75, + "completions/mean_terminated_length": 43.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.138972282409668, + "kl": 0.07470146007835865, + "learning_rate": 2.497666666666667e-06, + "loss": 0.0621, + "num_tokens": 747026.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 46.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7906563878059387, + "kl": 0.14853118360042572, + "learning_rate": 2.497333333333333e-06, + "loss": 0.0077, + "num_tokens": 747359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 46.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6633254885673523, + "kl": 0.08115752972662449, + "learning_rate": 2.497e-06, + "loss": 0.0049, + "num_tokens": 747764.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 46.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39222899079322815, + "kl": 0.04715419188141823, + "learning_rate": 2.4966666666666668e-06, + "loss": 0.0024, + "num_tokens": 748096.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 46.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01646861806511879, + "kl": 0.0010175041970796883, + "learning_rate": 2.4963333333333335e-06, + "loss": 0.0001, + "num_tokens": 748358.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 46.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042602717876434326, + "kl": 0.004759176634252071, + "learning_rate": 2.496e-06, + "loss": 0.0002, + "num_tokens": 748670.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 46.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3041357398033142, + "kl": 0.06130489706993103, + "learning_rate": 2.4956666666666667e-06, + "loss": 0.0032, + "num_tokens": 748997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 46.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007603069767355919, + "kl": 8.128583431243896e-05, + "learning_rate": 2.495333333333333e-06, + "loss": 0.0, + "num_tokens": 749209.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 46.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0253154207020998, + "kl": 0.09312248229980469, + "learning_rate": 2.4950000000000003e-06, + "loss": 0.0047, + "num_tokens": 749575.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 46.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11499327421188354, + "kl": 0.014323872746899724, + "learning_rate": 2.494666666666667e-06, + "loss": 0.0008, + "num_tokens": 749919.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 46.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1348366141319275, + "kl": 0.005608153063803911, + "learning_rate": 2.4943333333333334e-06, + "loss": 0.0003, + "num_tokens": 750195.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 46.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04203183576464653, + "kl": 0.035331493243575096, + "learning_rate": 2.494e-06, + "loss": 0.0019, + "num_tokens": 750485.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 46.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19115984439849854, + "kl": 0.02633888367563486, + "learning_rate": 2.4936666666666665e-06, + "loss": 0.0013, + "num_tokens": 750783.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 46.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 18.35003662109375, + "kl": 0.2072160392999649, + "learning_rate": 2.4933333333333333e-06, + "loss": -0.2274, + "num_tokens": 751000.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 46.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001844356651417911, + "kl": 0.0004361694009276107, + "learning_rate": 2.493e-06, + "loss": 0.0, + "num_tokens": 751272.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 46.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004649453330785036, + "kl": 0.001757514022756368, + "learning_rate": 2.492666666666667e-06, + "loss": 0.0001, + "num_tokens": 751556.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 46.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1287909746170044, + "kl": 0.016881283838301897, + "learning_rate": 2.4923333333333332e-06, + "loss": 0.0008, + "num_tokens": 751834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 46.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2693098187446594, + "kl": 0.030407872691284865, + "learning_rate": 2.492e-06, + "loss": 0.0014, + "num_tokens": 752143.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 46.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1576777994632721, + "kl": 0.01178092899499461, + "learning_rate": 2.491666666666667e-06, + "loss": 0.0006, + "num_tokens": 752462.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 46.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01044461503624916, + "kl": 0.0028221234679222107, + "learning_rate": 2.491333333333333e-06, + "loss": 0.0001, + "num_tokens": 752678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 46.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08084384351968765, + "kl": 0.008157667936757207, + "learning_rate": 2.4910000000000004e-06, + "loss": 0.0005, + "num_tokens": 752937.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 46.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09120894968509674, + "kl": 0.004084435146069154, + "learning_rate": 2.4906666666666667e-06, + "loss": 0.0002, + "num_tokens": 753193.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 46.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0273737870156765, + "kl": 0.0009900123986881226, + "learning_rate": 2.4903333333333335e-06, + "loss": 0.0, + "num_tokens": 753511.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 46.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010336839593946934, + "kl": 0.009016639087349176, + "learning_rate": 2.49e-06, + "loss": 0.0005, + "num_tokens": 753783.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 46.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11457367241382599, + "kl": 0.010811601998284459, + "learning_rate": 2.4896666666666667e-06, + "loss": 0.0006, + "num_tokens": 754080.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 46.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02475919760763645, + "kl": 0.008598325541242957, + "learning_rate": 2.4893333333333334e-06, + "loss": 0.0004, + "num_tokens": 754384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 46.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0379536934196949, + "kl": 0.044369883835315704, + "learning_rate": 2.4890000000000002e-06, + "loss": 0.0022, + "num_tokens": 754788.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 46.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04465353861451149, + "kl": 0.001675780862569809, + "learning_rate": 2.488666666666667e-06, + "loss": 0.0001, + "num_tokens": 755048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 46.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1090872511267662, + "kl": 0.005680212285369635, + "learning_rate": 2.4883333333333334e-06, + "loss": 0.0003, + "num_tokens": 755261.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 46.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7724571228027344, + "kl": 0.0728769488632679, + "learning_rate": 2.488e-06, + "loss": 0.0889, + "num_tokens": 755613.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 47.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3414354920387268, + "kl": 0.02498954487964511, + "learning_rate": 2.4876666666666665e-06, + "loss": 0.0012, + "num_tokens": 755848.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 47.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039831146597862244, + "kl": 0.09290435910224915, + "learning_rate": 2.4873333333333333e-06, + "loss": 0.0046, + "num_tokens": 756212.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 47.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031223390251398087, + "kl": 0.0023416792973876, + "learning_rate": 2.487e-06, + "loss": 0.0001, + "num_tokens": 756482.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1879866123199463, + "kl": 0.5608605849556625, + "learning_rate": 2.486666666666667e-06, + "loss": 0.0271, + "num_tokens": 756774.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 47.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007637239061295986, + "kl": 0.0013469458208419383, + "learning_rate": 2.4863333333333332e-06, + "loss": 0.0001, + "num_tokens": 756994.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 47.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.00135612487793, + "kl": 0.01819664239883423, + "learning_rate": 2.486e-06, + "loss": 0.0077, + "num_tokens": 757202.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 47.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1345706433057785, + "kl": 0.007921293145045638, + "learning_rate": 2.4856666666666668e-06, + "loss": 0.0005, + "num_tokens": 757548.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 47.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017033517360687256, + "kl": 0.0006393283547367901, + "learning_rate": 2.485333333333333e-06, + "loss": 0.0, + "num_tokens": 757816.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 47.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010561208240687847, + "kl": 0.0025972798466682434, + "learning_rate": 2.4850000000000003e-06, + "loss": 0.0001, + "num_tokens": 758032.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 47.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.55400276184082, + "kl": 0.024325484409928322, + "learning_rate": 2.4846666666666667e-06, + "loss": 0.052, + "num_tokens": 758360.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 47.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022138182539492846, + "kl": 0.0003915958950528875, + "learning_rate": 2.4843333333333335e-06, + "loss": 0.0, + "num_tokens": 758672.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 47.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5684993267059326, + "kl": 0.11294364742934704, + "learning_rate": 2.484e-06, + "loss": 0.006, + "num_tokens": 758982.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 47.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007175232283771038, + "kl": 0.0003366991877555847, + "learning_rate": 2.4836666666666666e-06, + "loss": 0.0, + "num_tokens": 759226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.009999999776482582, + "clip_ratio/high_mean": 0.009999999776482582, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009999999776482582, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 47.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.513923406600952, + "kl": 0.05781305208802223, + "learning_rate": 2.4833333333333334e-06, + "loss": 0.0348, + "num_tokens": 759558.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 47.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.342684030532837, + "kl": 0.39348478708416224, + "learning_rate": 2.483e-06, + "loss": 0.0149, + "num_tokens": 759923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 47.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04920942336320877, + "kl": 0.15138857811689377, + "learning_rate": 2.482666666666667e-06, + "loss": 0.0076, + "num_tokens": 760233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.01666666753590107, + "clip_ratio/high_mean": 0.01666666753590107, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01666666753590107, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 47.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.717015266418457, + "kl": 0.0757996179163456, + "learning_rate": 2.4823333333333333e-06, + "loss": 0.016, + "num_tokens": 760554.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 47.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03699135035276413, + "kl": 0.00045265257358551025, + "learning_rate": 2.482e-06, + "loss": 0.0, + "num_tokens": 760774.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 47.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.284163236618042, + "kl": 0.04800225980579853, + "learning_rate": 2.4816666666666665e-06, + "loss": -0.0975, + "num_tokens": 761119.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 47.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06733513623476028, + "kl": 0.0052927323267795146, + "learning_rate": 2.4813333333333333e-06, + "loss": 0.0003, + "num_tokens": 761447.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 47.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0070976922288537025, + "kl": 0.0002507045865058899, + "learning_rate": 2.481e-06, + "loss": 0.0, + "num_tokens": 761659.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 47.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1926703006029129, + "kl": 0.021734744776040316, + "learning_rate": 2.480666666666667e-06, + "loss": 0.0011, + "num_tokens": 761953.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1924740970134735, + "kl": 0.027753588743507862, + "learning_rate": 2.480333333333333e-06, + "loss": 0.0015, + "num_tokens": 762235.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 47.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016208812594413757, + "kl": 0.002021443098783493, + "learning_rate": 2.48e-06, + "loss": 0.0001, + "num_tokens": 762547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 47.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05852117761969566, + "kl": 0.010034375358372927, + "learning_rate": 2.4796666666666668e-06, + "loss": 0.0005, + "num_tokens": 762868.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 47.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.275321006774902, + "kl": 0.20327900350093842, + "learning_rate": 2.4793333333333335e-06, + "loss": 0.0253, + "num_tokens": 763173.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 47.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024304278194904327, + "kl": 0.0016468060784973204, + "learning_rate": 2.4790000000000003e-06, + "loss": 0.0001, + "num_tokens": 763453.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0864797979593277, + "kl": 0.020508273504674435, + "learning_rate": 2.4786666666666667e-06, + "loss": 0.001, + "num_tokens": 763726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 47.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4386816620826721, + "kl": 0.06215875409543514, + "learning_rate": 2.4783333333333335e-06, + "loss": 0.0031, + "num_tokens": 764131.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 47.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4438295364379883, + "kl": 0.47084836941212416, + "learning_rate": 2.478e-06, + "loss": 0.036, + "num_tokens": 764466.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 47.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007470057345926762, + "kl": 0.0007480502245016396, + "learning_rate": 2.4776666666666666e-06, + "loss": 0.0, + "num_tokens": 764726.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 47.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03928351774811745, + "kl": 0.007036558818072081, + "learning_rate": 2.4773333333333334e-06, + "loss": 0.0004, + "num_tokens": 764998.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 47.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007212792988866568, + "kl": 5.517899990081787e-05, + "learning_rate": 2.477e-06, + "loss": 0.0, + "num_tokens": 765210.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 47.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9965059757232666, + "kl": 0.035281239077448845, + "learning_rate": 2.476666666666667e-06, + "loss": 0.0476, + "num_tokens": 765576.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 47.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00748828612267971, + "kl": 0.015323393978178501, + "learning_rate": 2.4763333333333333e-06, + "loss": 0.0008, + "num_tokens": 765836.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 47.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.799642562866211, + "kl": 0.06395893171429634, + "learning_rate": 2.476e-06, + "loss": -0.0077, + "num_tokens": 766177.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 47.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20798420906066895, + "kl": 0.020984972827136517, + "learning_rate": 2.4756666666666665e-06, + "loss": 0.0011, + "num_tokens": 766467.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 47.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013636157847940922, + "kl": 0.0006756596267223358, + "learning_rate": 2.4753333333333332e-06, + "loss": 0.0, + "num_tokens": 766727.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 47.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8445509672164917, + "kl": 0.09059044159948826, + "learning_rate": 2.475e-06, + "loss": 0.0049, + "num_tokens": 767003.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 47.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017572080250829458, + "kl": 0.0007877677562646568, + "learning_rate": 2.474666666666667e-06, + "loss": 0.0, + "num_tokens": 767263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 47.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.777828216552734, + "kl": 0.015769362449645996, + "learning_rate": 2.4743333333333336e-06, + "loss": -0.0103, + "num_tokens": 767589.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 47.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050758421421051025, + "kl": 0.006462006596848369, + "learning_rate": 2.474e-06, + "loss": 0.0003, + "num_tokens": 767893.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011461479589343071, + "kl": 0.008317825384438038, + "learning_rate": 2.4736666666666667e-06, + "loss": 0.0004, + "num_tokens": 768165.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 47.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08216534554958344, + "kl": 0.0032526047143619508, + "learning_rate": 2.4733333333333335e-06, + "loss": 0.0002, + "num_tokens": 768421.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 47.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010482510551810265, + "kl": 0.006229990627616644, + "learning_rate": 2.4730000000000003e-06, + "loss": 0.0003, + "num_tokens": 768710.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01895684376358986, + "kl": 0.0023190357023850083, + "learning_rate": 2.4726666666666667e-06, + "loss": 0.0001, + "num_tokens": 768994.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 47.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04171139374375343, + "kl": 0.002461825031787157, + "learning_rate": 2.4723333333333334e-06, + "loss": 0.0001, + "num_tokens": 769230.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 47.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007956587709486485, + "kl": 0.0022862255573272705, + "learning_rate": 2.472e-06, + "loss": 0.0001, + "num_tokens": 769466.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 47.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.120576873421669, + "kl": 0.02623407356441021, + "learning_rate": 2.4716666666666666e-06, + "loss": 0.0013, + "num_tokens": 769798.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 47.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1029219850897789, + "kl": 0.015235194936394691, + "learning_rate": 2.4713333333333334e-06, + "loss": 0.0006, + "num_tokens": 770090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 47.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04186006635427475, + "kl": 0.0017276121652685106, + "learning_rate": 2.471e-06, + "loss": 0.0001, + "num_tokens": 770386.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 47.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05727760121226311, + "kl": 0.014936825260519981, + "learning_rate": 2.470666666666667e-06, + "loss": 0.0007, + "num_tokens": 770646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 47.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036273978650569916, + "kl": 0.007272610906511545, + "learning_rate": 2.4703333333333333e-06, + "loss": 0.0004, + "num_tokens": 770916.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 47.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12503013014793396, + "kl": 0.021614138036966324, + "learning_rate": 2.47e-06, + "loss": 0.0011, + "num_tokens": 771216.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 48.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010718362173065543, + "kl": 0.0012030623038299382, + "learning_rate": 2.4696666666666664e-06, + "loss": 0.0001, + "num_tokens": 771496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 48.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038712776731699705, + "kl": 0.0004808281664736569, + "learning_rate": 2.4693333333333336e-06, + "loss": 0.0, + "num_tokens": 771768.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 48.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.828770637512207, + "kl": 0.10881831945152953, + "learning_rate": 2.469e-06, + "loss": -0.0426, + "num_tokens": 772026.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 48.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00795852392911911, + "kl": 0.0023187175393104553, + "learning_rate": 2.4686666666666668e-06, + "loss": 0.0001, + "num_tokens": 772262.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 48.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06079699099063873, + "kl": 0.010363928508013487, + "learning_rate": 2.4683333333333336e-06, + "loss": 0.0005, + "num_tokens": 772566.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012865503318607807, + "kl": 0.007851775735616684, + "learning_rate": 2.468e-06, + "loss": 0.0004, + "num_tokens": 772838.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24803395569324493, + "kl": 0.035253395326435566, + "learning_rate": 2.4676666666666667e-06, + "loss": 0.0019, + "num_tokens": 773119.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 48.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03238431736826897, + "kl": 0.00431145797483623, + "learning_rate": 2.4673333333333335e-06, + "loss": 0.0002, + "num_tokens": 773390.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 48.148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 137.25222778320312, + "kl": 10.027743226848543, + "learning_rate": 2.4670000000000003e-06, + "loss": 0.5305, + "num_tokens": 773681.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 48.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.340005397796631, + "kl": 0.0995195247232914, + "learning_rate": 2.4666666666666666e-06, + "loss": 0.0852, + "num_tokens": 773979.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23584656417369843, + "kl": 0.04996185004711151, + "learning_rate": 2.4663333333333334e-06, + "loss": 0.0027, + "num_tokens": 774299.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2928851544857025, + "kl": 0.026695653796195984, + "learning_rate": 2.4659999999999998e-06, + "loss": 0.0014, + "num_tokens": 774585.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3870152533054352, + "kl": 0.05899230018258095, + "learning_rate": 2.4656666666666666e-06, + "loss": 0.0029, + "num_tokens": 774873.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 48.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11196646094322205, + "kl": 0.020365355536341667, + "learning_rate": 2.4653333333333338e-06, + "loss": 0.001, + "num_tokens": 775210.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 48.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006418376229703426, + "kl": 0.0004412122070789337, + "learning_rate": 2.465e-06, + "loss": 0.0, + "num_tokens": 775454.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 48.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012934365309774876, + "kl": 0.014251023530960083, + "learning_rate": 2.464666666666667e-06, + "loss": 0.0007, + "num_tokens": 775714.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 48.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019527660682797432, + "kl": 0.0028348437044769526, + "learning_rate": 2.4643333333333333e-06, + "loss": 0.0001, + "num_tokens": 775994.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 48.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006592772901058197, + "kl": 0.0004989169538021088, + "learning_rate": 2.464e-06, + "loss": 0.0, + "num_tokens": 776254.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 48.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03467896953225136, + "kl": 0.007223922293633223, + "learning_rate": 2.463666666666667e-06, + "loss": 0.0004, + "num_tokens": 776524.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 48.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016406308859586716, + "kl": 0.0010848395177163184, + "learning_rate": 2.4633333333333336e-06, + "loss": 0.0001, + "num_tokens": 776849.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 48.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047961898148059845, + "kl": 0.002674810995813459, + "learning_rate": 2.463e-06, + "loss": 0.0001, + "num_tokens": 777123.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 48.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.209963321685791, + "kl": 0.04117728769779205, + "learning_rate": 2.4626666666666667e-06, + "loss": -0.035, + "num_tokens": 777483.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 48.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03381167724728584, + "kl": 0.019356227945536375, + "learning_rate": 2.4623333333333335e-06, + "loss": 0.001, + "num_tokens": 777760.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 48.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05479121953248978, + "kl": 0.00934884324669838, + "learning_rate": 2.462e-06, + "loss": 0.0005, + "num_tokens": 778042.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 48.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08005852997303009, + "kl": 0.01333491737022996, + "learning_rate": 2.4616666666666667e-06, + "loss": 0.0007, + "num_tokens": 778365.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 48.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07427375018596649, + "kl": 0.0055592358112335205, + "learning_rate": 2.4613333333333335e-06, + "loss": 0.0003, + "num_tokens": 778581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 80.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 48.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4323911666870117, + "kl": 0.023786271922290325, + "learning_rate": 2.4610000000000002e-06, + "loss": 0.481, + "num_tokens": 779127.0, + "reward": 4.425000190734863, + "reward_std": 3.9609551429748535, + "rewards/reward_combined/mean": 4.425000190734863, + "rewards/reward_combined/std": 3.9609553813934326, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 48.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00801610667258501, + "kl": 0.0007882237550802529, + "learning_rate": 2.4606666666666666e-06, + "loss": 0.0, + "num_tokens": 779347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 48.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003652064478956163, + "kl": 2.3402273654937744e-05, + "learning_rate": 2.4603333333333334e-06, + "loss": 0.0, + "num_tokens": 779567.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 48.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.588613986968994, + "kl": 0.05327329598367214, + "learning_rate": 2.4599999999999997e-06, + "loss": 0.1191, + "num_tokens": 779931.0, + "reward": 3.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 3.625, + "rewards/reward_combined/std": 2.75, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 48.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19643938541412354, + "kl": 0.00969262095168233, + "learning_rate": 2.4596666666666665e-06, + "loss": 0.0005, + "num_tokens": 780145.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 48.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02397293969988823, + "kl": 0.006498393137007952, + "learning_rate": 2.4593333333333337e-06, + "loss": 0.0003, + "num_tokens": 780433.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 48.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6208887100219727, + "kl": 0.058538658544421196, + "learning_rate": 2.459e-06, + "loss": 0.0027, + "num_tokens": 780763.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 48.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0708281397819519, + "kl": 0.0025162369711324573, + "learning_rate": 2.458666666666667e-06, + "loss": 0.0001, + "num_tokens": 781019.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 48.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010458112694323063, + "kl": 0.0020992299541831017, + "learning_rate": 2.4583333333333332e-06, + "loss": 0.0001, + "num_tokens": 781331.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 48.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06427004933357239, + "kl": 0.003297789953649044, + "learning_rate": 2.458e-06, + "loss": 0.0002, + "num_tokens": 781598.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 48.666666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.400606393814087, + "kl": 0.09866466373205185, + "learning_rate": 2.457666666666667e-06, + "loss": 0.0915, + "num_tokens": 781954.0, + "reward": 5.5, + "reward_std": 2.309401035308838, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 2.309401035308838, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 48.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2850549817085266, + "kl": 0.03583748638629913, + "learning_rate": 2.4573333333333336e-06, + "loss": 0.0017, + "num_tokens": 782160.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 48.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012169628404080868, + "kl": 0.2668505907058716, + "learning_rate": 2.457e-06, + "loss": 0.0133, + "num_tokens": 782464.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 48.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007750585209578276, + "kl": 0.0011209641816094518, + "learning_rate": 2.4566666666666667e-06, + "loss": 0.0001, + "num_tokens": 782760.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 48.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010310211218893528, + "kl": 0.00013162195682525635, + "learning_rate": 2.4563333333333335e-06, + "loss": 0.0, + "num_tokens": 782972.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 48.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9040417671203613, + "kl": 0.06567835807800293, + "learning_rate": 2.456e-06, + "loss": -0.0099, + "num_tokens": 783320.0, + "reward": 5.375, + "reward_std": 2.75, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.75, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 48.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017361639067530632, + "kl": 0.0015062980819493532, + "learning_rate": 2.4556666666666666e-06, + "loss": 0.0001, + "num_tokens": 783555.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.007575757801532745, + "clip_ratio/high_mean": 0.007575757801532745, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007575757801532745, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 48.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.701044082641602, + "kl": 0.05855248123407364, + "learning_rate": 2.4553333333333334e-06, + "loss": 0.0993, + "num_tokens": 783917.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 48.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03841494768857956, + "kl": 0.002091391012072563, + "learning_rate": 2.4550000000000002e-06, + "loss": 0.0001, + "num_tokens": 784233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 48.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12177629768848419, + "kl": 0.05128871090710163, + "learning_rate": 2.4546666666666666e-06, + "loss": 0.0026, + "num_tokens": 784531.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 48.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09979604929685593, + "kl": 0.10168707370758057, + "learning_rate": 2.4543333333333334e-06, + "loss": 0.0051, + "num_tokens": 784898.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 48.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1377181112766266, + "kl": 0.015214312821626663, + "learning_rate": 2.4539999999999997e-06, + "loss": 0.0008, + "num_tokens": 785156.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 48.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017956532537937164, + "kl": 0.04019933193922043, + "learning_rate": 2.453666666666667e-06, + "loss": 0.002, + "num_tokens": 785561.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 48.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8203216791152954, + "kl": 0.13380496203899384, + "learning_rate": 2.4533333333333337e-06, + "loss": 0.0056, + "num_tokens": 785877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 48.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04856766760349274, + "kl": 0.12611490115523338, + "learning_rate": 2.453e-06, + "loss": 0.0063, + "num_tokens": 786187.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 48.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00923575833439827, + "kl": 0.00044141808757558465, + "learning_rate": 2.452666666666667e-06, + "loss": 0.0, + "num_tokens": 786504.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 48.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02838926948606968, + "kl": 0.005278926342725754, + "learning_rate": 2.452333333333333e-06, + "loss": 0.0003, + "num_tokens": 786808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 48.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07370824366807938, + "kl": 0.01963486336171627, + "learning_rate": 2.452e-06, + "loss": 0.001, + "num_tokens": 787128.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08243677020072937, + "kl": 0.006912033539265394, + "learning_rate": 2.4516666666666668e-06, + "loss": 0.0004, + "num_tokens": 787392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 49.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01931563951075077, + "kl": 0.0012164450890850276, + "learning_rate": 2.4513333333333336e-06, + "loss": 0.0001, + "num_tokens": 787706.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 49.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0468934774398804, + "kl": 0.06819241680204868, + "learning_rate": 2.451e-06, + "loss": -0.0331, + "num_tokens": 788107.0, + "reward": 1.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.504814088344574, + "kl": 0.06039337324909866, + "learning_rate": 2.4506666666666667e-06, + "loss": 0.003, + "num_tokens": 788391.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 49.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07930750399827957, + "kl": 0.010783094447106123, + "learning_rate": 2.4503333333333335e-06, + "loss": 0.0006, + "num_tokens": 788649.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 49.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02539624646306038, + "kl": 0.0014381707296706736, + "learning_rate": 2.45e-06, + "loss": 0.0001, + "num_tokens": 788917.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 49.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008240375318564475, + "kl": 0.001209248322993517, + "learning_rate": 2.449666666666667e-06, + "loss": 0.0001, + "num_tokens": 789197.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 49.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39711856842041016, + "kl": 0.02422859240323305, + "learning_rate": 2.4493333333333334e-06, + "loss": 0.0012, + "num_tokens": 789532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1310444474220276, + "kl": 0.013972847256809473, + "learning_rate": 2.449e-06, + "loss": 0.0006, + "num_tokens": 789824.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 49.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037588595296256244, + "kl": 2.4981796741485596e-05, + "learning_rate": 2.4486666666666665e-06, + "loss": 0.0, + "num_tokens": 790044.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 49.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07478267699480057, + "kl": 0.013302026316523552, + "learning_rate": 2.4483333333333333e-06, + "loss": 0.0007, + "num_tokens": 790368.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 49.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013030369766056538, + "kl": 0.0034678855445235968, + "learning_rate": 2.448e-06, + "loss": 0.0002, + "num_tokens": 790656.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.014285714365541935, + "clip_ratio/high_mean": 0.014285714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014285714365541935, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.099160194396973, + "kl": 0.04462042637169361, + "learning_rate": 2.447666666666667e-06, + "loss": 0.0722, + "num_tokens": 790963.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 49.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10959002375602722, + "kl": 0.0050025584059767425, + "learning_rate": 2.4473333333333337e-06, + "loss": 0.0003, + "num_tokens": 791196.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 49.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008650427684187889, + "kl": 0.0024437233805656433, + "learning_rate": 2.447e-06, + "loss": 0.0001, + "num_tokens": 791412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02097531035542488, + "kl": 0.00032559634564677253, + "learning_rate": 2.446666666666667e-06, + "loss": 0.0, + "num_tokens": 791668.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 49.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12095050513744354, + "kl": 0.019461162388324738, + "learning_rate": 2.446333333333333e-06, + "loss": 0.001, + "num_tokens": 791967.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 49.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03256397321820259, + "kl": 0.002429179206956178, + "learning_rate": 2.446e-06, + "loss": 0.0001, + "num_tokens": 792263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 49.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036686867475509644, + "kl": 0.007383710239082575, + "learning_rate": 2.4456666666666667e-06, + "loss": 0.0004, + "num_tokens": 792556.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2952873706817627, + "kl": 0.5442184414714575, + "learning_rate": 2.4453333333333335e-06, + "loss": -0.1153, + "num_tokens": 792886.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 49.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037277434021234512, + "kl": 0.0004651211202144623, + "learning_rate": 2.445e-06, + "loss": 0.0, + "num_tokens": 793146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 49.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008202599361538887, + "kl": 0.0007944583776406944, + "learning_rate": 2.4446666666666667e-06, + "loss": 0.0, + "num_tokens": 793366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 49.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034822553396224976, + "kl": 0.003939538088161498, + "learning_rate": 2.4443333333333334e-06, + "loss": 0.0002, + "num_tokens": 793700.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 49.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08850719779729843, + "kl": 0.009072621120139956, + "learning_rate": 2.444e-06, + "loss": 0.0005, + "num_tokens": 793950.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10264670848846436, + "kl": 0.0181496012955904, + "learning_rate": 2.443666666666667e-06, + "loss": 0.0009, + "num_tokens": 794220.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 49.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010163417086005211, + "kl": 0.0017835497856140137, + "learning_rate": 2.4433333333333334e-06, + "loss": 0.0001, + "num_tokens": 794456.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 49.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12668950855731964, + "kl": 0.035411637276411057, + "learning_rate": 2.443e-06, + "loss": 0.0017, + "num_tokens": 794820.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 49.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296569526195526, + "kl": 0.0030661100754514337, + "learning_rate": 2.4426666666666665e-06, + "loss": 0.0001, + "num_tokens": 795034.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 49.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01004098355770111, + "kl": 0.00011940300464630127, + "learning_rate": 2.4423333333333333e-06, + "loss": 0.0, + "num_tokens": 795246.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 49.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.739889621734619, + "kl": 0.020626836456358433, + "learning_rate": 2.442e-06, + "loss": 0.1356, + "num_tokens": 795595.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18879079818725586, + "kl": 0.03710257029160857, + "learning_rate": 2.441666666666667e-06, + "loss": 0.0018, + "num_tokens": 795884.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 49.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03499231114983559, + "kl": 0.012018134817481041, + "learning_rate": 2.4413333333333336e-06, + "loss": 0.0006, + "num_tokens": 796145.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05656729266047478, + "kl": 0.012749699875712395, + "learning_rate": 2.441e-06, + "loss": 0.0006, + "num_tokens": 796434.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.010416666977107525, + "clip_ratio/low_min": 0.010416666977107525, + "clip_ratio/region_mean": 0.010416666977107525, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 49.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.339295387268066, + "kl": 0.02863692305982113, + "learning_rate": 2.440666666666667e-06, + "loss": -0.029, + "num_tokens": 796758.0, + "reward": 2.125, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 2.125, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 49.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1148946285247803, + "kl": 0.1171901561319828, + "learning_rate": 2.440333333333333e-06, + "loss": -0.0404, + "num_tokens": 797106.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 49.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1947810649871826, + "kl": 0.078176059294492, + "learning_rate": 2.44e-06, + "loss": 0.0036, + "num_tokens": 797392.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 49.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19639906287193298, + "kl": 0.02894734777510166, + "learning_rate": 2.4396666666666667e-06, + "loss": 0.0016, + "num_tokens": 797672.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 49.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8781650066375732, + "kl": 0.05732029862701893, + "learning_rate": 2.4393333333333335e-06, + "loss": -0.1077, + "num_tokens": 798016.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08665555715560913, + "kl": 0.036981672048568726, + "learning_rate": 2.439e-06, + "loss": 0.0018, + "num_tokens": 798356.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037065740674734116, + "kl": 0.001828221109462902, + "learning_rate": 2.4386666666666666e-06, + "loss": 0.0001, + "num_tokens": 798635.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 49.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030324524268507957, + "kl": 0.2642036974430084, + "learning_rate": 2.4383333333333334e-06, + "loss": 0.0132, + "num_tokens": 798940.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021105341147631407, + "kl": 0.0008045106951612979, + "learning_rate": 2.438e-06, + "loss": 0.0, + "num_tokens": 799200.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028025034815073013, + "kl": 0.0010068500705529004, + "learning_rate": 2.437666666666667e-06, + "loss": 0.0001, + "num_tokens": 799468.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 49.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053984951227903366, + "kl": 0.14922355860471725, + "learning_rate": 2.4373333333333333e-06, + "loss": 0.0075, + "num_tokens": 799780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 49.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022466666996479034, + "kl": 0.0025486857630312443, + "learning_rate": 2.437e-06, + "loss": 0.0001, + "num_tokens": 800040.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 49.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06602063775062561, + "kl": 0.0004631355404853821, + "learning_rate": 2.4366666666666665e-06, + "loss": 0.0, + "num_tokens": 800252.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 49.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5515338778495789, + "kl": 0.05331879248842597, + "learning_rate": 2.4363333333333333e-06, + "loss": 0.003, + "num_tokens": 800531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 49.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0370035246014595, + "kl": 0.015739089343696833, + "learning_rate": 2.436e-06, + "loss": 0.0007, + "num_tokens": 800914.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07922983169555664, + "kl": 0.027185593266040087, + "learning_rate": 2.435666666666667e-06, + "loss": 0.0014, + "num_tokens": 801202.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 49.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11076068878173828, + "kl": 0.01239091157913208, + "learning_rate": 2.4353333333333336e-06, + "loss": 0.0006, + "num_tokens": 801470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 49.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06370322406291962, + "kl": 0.08850816637277603, + "learning_rate": 2.435e-06, + "loss": 0.0044, + "num_tokens": 801834.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 49.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08588910847902298, + "kl": 0.005389301746618003, + "learning_rate": 2.4346666666666668e-06, + "loss": 0.0003, + "num_tokens": 802155.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 49.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09936283528804779, + "kl": 0.013003773987293243, + "learning_rate": 2.434333333333333e-06, + "loss": 0.0007, + "num_tokens": 802467.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 49.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1194983720779419, + "kl": 0.014567551203072071, + "learning_rate": 2.434e-06, + "loss": 0.0008, + "num_tokens": 802771.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 50.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7284114360809326, + "kl": 0.041163988411426544, + "learning_rate": 2.4336666666666667e-06, + "loss": 0.0668, + "num_tokens": 803107.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 50.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.619093894958496, + "kl": 0.047781015920918435, + "learning_rate": 2.4333333333333335e-06, + "loss": 0.0001, + "num_tokens": 803327.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 50.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01032946351915598, + "kl": 0.26712600886821747, + "learning_rate": 2.4330000000000003e-06, + "loss": 0.0134, + "num_tokens": 803631.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 50.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04903581738471985, + "kl": 0.0033776217605918646, + "learning_rate": 2.4326666666666666e-06, + "loss": 0.0002, + "num_tokens": 803958.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 50.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024718619883060455, + "kl": 0.003179234452545643, + "learning_rate": 2.4323333333333334e-06, + "loss": 0.0002, + "num_tokens": 804270.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 50.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06741975992918015, + "kl": 0.01074890187010169, + "learning_rate": 2.432e-06, + "loss": 0.0005, + "num_tokens": 804576.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07277852296829224, + "kl": 0.01764109404757619, + "learning_rate": 2.431666666666667e-06, + "loss": 0.0009, + "num_tokens": 804850.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 50.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07444789260625839, + "kl": 0.008178258314728737, + "learning_rate": 2.4313333333333333e-06, + "loss": 0.0004, + "num_tokens": 805167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 50.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008052741177380085, + "kl": 0.0012273192405700684, + "learning_rate": 2.431e-06, + "loss": 0.0001, + "num_tokens": 805447.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 50.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04898441210389137, + "kl": 0.0042346930131316185, + "learning_rate": 2.4306666666666665e-06, + "loss": 0.0002, + "num_tokens": 805683.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 50.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002621825260575861, + "kl": 8.128583431243896e-06, + "learning_rate": 2.4303333333333332e-06, + "loss": 0.0, + "num_tokens": 805903.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 50.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.677748680114746, + "kl": 0.01190832769498229, + "learning_rate": 2.43e-06, + "loss": -0.036, + "num_tokens": 806228.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 50.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1671454906463623, + "kl": 0.030420560389757156, + "learning_rate": 2.429666666666667e-06, + "loss": 0.037, + "num_tokens": 806595.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 50.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009926141239702702, + "kl": 0.002172514796257019, + "learning_rate": 2.4293333333333336e-06, + "loss": 0.0001, + "num_tokens": 806811.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 50.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.553558588027954, + "kl": 0.5103753441944718, + "learning_rate": 2.429e-06, + "loss": 0.0613, + "num_tokens": 807072.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.161747455596924, + "kl": 0.07729346863925457, + "learning_rate": 2.4286666666666667e-06, + "loss": -0.1388, + "num_tokens": 807374.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 50.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65897798538208, + "kl": 0.02129838801920414, + "learning_rate": 2.428333333333333e-06, + "loss": -0.0117, + "num_tokens": 807707.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 50.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016636377200484276, + "kl": 0.0006402172148227692, + "learning_rate": 2.4280000000000003e-06, + "loss": 0.0, + "num_tokens": 807967.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 50.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2735231816768646, + "kl": 0.057274749502539635, + "learning_rate": 2.4276666666666667e-06, + "loss": 0.0029, + "num_tokens": 808283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 50.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.809070587158203, + "kl": 0.03625241667032242, + "learning_rate": 2.4273333333333334e-06, + "loss": 0.0365, + "num_tokens": 808584.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08341844379901886, + "kl": 0.008286748547106981, + "learning_rate": 2.4270000000000002e-06, + "loss": 0.0004, + "num_tokens": 808852.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.658259630203247, + "kl": 0.052839044481515884, + "learning_rate": 2.4266666666666666e-06, + "loss": -0.179, + "num_tokens": 809204.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 50.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11819658428430557, + "kl": 0.015196267049759626, + "learning_rate": 2.4263333333333334e-06, + "loss": 0.0008, + "num_tokens": 809533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 50.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.77386736869812, + "kl": 0.0791231095790863, + "learning_rate": 2.426e-06, + "loss": 0.0951, + "num_tokens": 809887.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 50.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00891206320375204, + "kl": 0.0021475031971931458, + "learning_rate": 2.425666666666667e-06, + "loss": 0.0001, + "num_tokens": 810123.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 50.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051420990377664566, + "kl": 0.011426992248743773, + "learning_rate": 2.4253333333333333e-06, + "loss": 0.0006, + "num_tokens": 810463.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 50.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4433037042617798, + "kl": 0.012402377324178815, + "learning_rate": 2.425e-06, + "loss": -0.0009, + "num_tokens": 810751.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020202970132231712, + "kl": 0.005042759468778968, + "learning_rate": 2.4246666666666664e-06, + "loss": 0.0003, + "num_tokens": 811021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 50.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048094965517520905, + "kl": 0.011728422716259956, + "learning_rate": 2.4243333333333332e-06, + "loss": 0.0006, + "num_tokens": 811352.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0017301038606092334, + "clip_ratio/high_mean": 0.0017301038606092334, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017301038606092334, + "completion_length": 86.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 86.75, + "completions/mean_terminated_length": 30.33333396911621, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 50.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9604570865631104, + "kl": 0.025076637975871563, + "learning_rate": 2.4240000000000004e-06, + "loss": 0.387, + "num_tokens": 811915.0, + "reward": 2.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 2.25, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 50.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6494704484939575, + "kl": 0.03713542781770229, + "learning_rate": 2.4236666666666668e-06, + "loss": -0.0793, + "num_tokens": 812334.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 50.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06151675805449486, + "kl": 0.000841870903968811, + "learning_rate": 2.4233333333333336e-06, + "loss": 0.0, + "num_tokens": 812546.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08187728375196457, + "kl": 0.007671724772080779, + "learning_rate": 2.423e-06, + "loss": 0.0004, + "num_tokens": 812830.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 50.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05948270112276077, + "kl": 0.10078465938568115, + "learning_rate": 2.4226666666666667e-06, + "loss": 0.005, + "num_tokens": 813198.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08605901896953583, + "kl": 0.006832719314843416, + "learning_rate": 2.4223333333333335e-06, + "loss": 0.0003, + "num_tokens": 813466.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 50.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09176906198263168, + "kl": 0.006867297692224383, + "learning_rate": 2.4220000000000003e-06, + "loss": 0.0004, + "num_tokens": 813728.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 50.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0165004413574934, + "kl": 0.000317606347380206, + "learning_rate": 2.4216666666666666e-06, + "loss": 0.0, + "num_tokens": 813984.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 50.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07851296663284302, + "kl": 0.013246928807348013, + "learning_rate": 2.4213333333333334e-06, + "loss": 0.0007, + "num_tokens": 814245.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 50.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.245790958404541, + "kl": 0.09969478845596313, + "learning_rate": 2.421e-06, + "loss": 0.0353, + "num_tokens": 814619.0, + "reward": 5.0, + "reward_std": 3.34165620803833, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.34165620803833, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 50.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02559671737253666, + "kl": 0.001442710228729993, + "learning_rate": 2.4206666666666666e-06, + "loss": 0.0001, + "num_tokens": 814899.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 50.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.95418381690979, + "kl": 0.036475375294685364, + "learning_rate": 2.4203333333333333e-06, + "loss": 0.0024, + "num_tokens": 815109.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010515622794628143, + "kl": 0.008626106195151806, + "learning_rate": 2.42e-06, + "loss": 0.0004, + "num_tokens": 815381.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 50.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006234433501958847, + "kl": 0.0008897421357687563, + "learning_rate": 2.419666666666667e-06, + "loss": 0.0, + "num_tokens": 815643.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06533730030059814, + "kl": 0.006466299062594771, + "learning_rate": 2.4193333333333333e-06, + "loss": 0.0003, + "num_tokens": 815917.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.251893997192383, + "kl": 0.025556170847266912, + "learning_rate": 2.419e-06, + "loss": -0.1101, + "num_tokens": 816203.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 50.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00700110150501132, + "kl": 0.0005770400166511536, + "learning_rate": 2.4186666666666664e-06, + "loss": 0.0, + "num_tokens": 816447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 50.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.8227620124816895, + "kl": 0.2471969798207283, + "learning_rate": 2.418333333333333e-06, + "loss": -0.0321, + "num_tokens": 816757.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 50.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30015987157821655, + "kl": 0.02381989953573793, + "learning_rate": 2.4180000000000004e-06, + "loss": 0.0013, + "num_tokens": 817055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 50.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05472929775714874, + "kl": 0.006903106113895774, + "learning_rate": 2.4176666666666668e-06, + "loss": 0.0003, + "num_tokens": 817348.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 50.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07999484241008759, + "kl": 0.015777053777128458, + "learning_rate": 2.4173333333333335e-06, + "loss": 0.0008, + "num_tokens": 817634.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 50.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0765211284160614, + "kl": 0.015375125221908092, + "learning_rate": 2.417e-06, + "loss": 0.0008, + "num_tokens": 818048.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 50.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14390313625335693, + "kl": 0.0063751935958862305, + "learning_rate": 2.4166666666666667e-06, + "loss": 0.0005, + "num_tokens": 818264.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 50.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.940622806549072, + "kl": 0.03030684869736433, + "learning_rate": 2.4163333333333335e-06, + "loss": 0.0357, + "num_tokens": 818575.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 50.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018179886043071747, + "kl": 0.0007749234209768474, + "learning_rate": 2.4160000000000002e-06, + "loss": 0.0, + "num_tokens": 818843.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 51.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08896339684724808, + "kl": 0.010296330321580172, + "learning_rate": 2.4156666666666666e-06, + "loss": 0.0005, + "num_tokens": 819141.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 51.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.347787857055664, + "kl": 0.05637666955590248, + "learning_rate": 2.4153333333333334e-06, + "loss": 0.0105, + "num_tokens": 819460.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 51.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013472180580720305, + "kl": 0.0012846568133682013, + "learning_rate": 2.415e-06, + "loss": 0.0001, + "num_tokens": 819740.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 51.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03685910999774933, + "kl": 0.006507994374260306, + "learning_rate": 2.4146666666666665e-06, + "loss": 0.0003, + "num_tokens": 820042.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 51.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.2996296882629395, + "kl": 0.018799642100930214, + "learning_rate": 2.4143333333333333e-06, + "loss": 0.2577, + "num_tokens": 820340.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 51.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42697465419769287, + "kl": 0.04794200509786606, + "learning_rate": 2.414e-06, + "loss": 0.0036, + "num_tokens": 820552.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 51.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03810492530465126, + "kl": 0.010631228797137737, + "learning_rate": 2.413666666666667e-06, + "loss": 0.0005, + "num_tokens": 820878.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 51.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33721089363098145, + "kl": 0.0593524519354105, + "learning_rate": 2.4133333333333332e-06, + "loss": 0.0043, + "num_tokens": 821183.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04073914885520935, + "kl": 0.0020348261459730566, + "learning_rate": 2.413e-06, + "loss": 0.0001, + "num_tokens": 821455.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 51.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00807228684425354, + "kl": 0.26762712001800537, + "learning_rate": 2.4126666666666664e-06, + "loss": 0.0134, + "num_tokens": 821759.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 51.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01383829303085804, + "kl": 0.0006056129932403564, + "learning_rate": 2.4123333333333336e-06, + "loss": 0.0, + "num_tokens": 822019.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 51.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002749810926616192, + "kl": 0.016368752345442772, + "learning_rate": 2.4120000000000004e-06, + "loss": 0.0008, + "num_tokens": 822279.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 51.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10478509962558746, + "kl": 0.007327968487516046, + "learning_rate": 2.4116666666666667e-06, + "loss": 0.0004, + "num_tokens": 822606.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 51.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.27007794380188, + "kl": 0.07198414951562881, + "learning_rate": 2.4113333333333335e-06, + "loss": 0.0572, + "num_tokens": 822899.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 51.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.100656032562256, + "kl": 0.03164546750485897, + "learning_rate": 2.411e-06, + "loss": 0.1144, + "num_tokens": 823180.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 51.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044763192534446716, + "kl": 0.0057826959528028965, + "learning_rate": 2.4106666666666667e-06, + "loss": 0.0003, + "num_tokens": 823470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 51.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.564068078994751, + "kl": 0.025499539449810982, + "learning_rate": 2.4103333333333334e-06, + "loss": -0.1491, + "num_tokens": 823821.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 51.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026910219341516495, + "kl": 0.005641356110572815, + "learning_rate": 2.4100000000000002e-06, + "loss": 0.0003, + "num_tokens": 824109.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 51.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.387589693069458, + "kl": 0.10334932431578636, + "learning_rate": 2.4096666666666666e-06, + "loss": 0.1159, + "num_tokens": 824454.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 51.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.524815559387207, + "kl": 0.09898410178720951, + "learning_rate": 2.4093333333333334e-06, + "loss": -0.0125, + "num_tokens": 824731.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 51.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06173790991306305, + "kl": 0.001071445643901825, + "learning_rate": 2.409e-06, + "loss": 0.0001, + "num_tokens": 824944.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 51.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05020973086357117, + "kl": 0.0029343462083488703, + "learning_rate": 2.4086666666666665e-06, + "loss": 0.0001, + "num_tokens": 825187.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 51.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13685880601406097, + "kl": 0.024513863027095795, + "learning_rate": 2.4083333333333337e-06, + "loss": 0.0012, + "num_tokens": 825531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 51.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03583168238401413, + "kl": 0.0018282131059095263, + "learning_rate": 2.408e-06, + "loss": 0.0001, + "num_tokens": 825828.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 51.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05771275609731674, + "kl": 0.004931454313918948, + "learning_rate": 2.407666666666667e-06, + "loss": 0.0003, + "num_tokens": 826144.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 51.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.123723983764648, + "kl": 0.09181689098477364, + "learning_rate": 2.407333333333333e-06, + "loss": 0.0424, + "num_tokens": 826483.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 51.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010458242148160934, + "kl": 0.0023326099617406726, + "learning_rate": 2.407e-06, + "loss": 0.0001, + "num_tokens": 826703.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01242623571306467, + "kl": 0.0002751588763203472, + "learning_rate": 2.4066666666666668e-06, + "loss": 0.0, + "num_tokens": 826959.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 51.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0755714550614357, + "kl": 0.02212864439934492, + "learning_rate": 2.4063333333333336e-06, + "loss": 0.0011, + "num_tokens": 827249.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 51.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0595531463623047, + "kl": 0.2190406396985054, + "learning_rate": 2.4060000000000003e-06, + "loss": 0.011, + "num_tokens": 827563.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 51.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09541331231594086, + "kl": 0.011347746010869741, + "learning_rate": 2.4056666666666667e-06, + "loss": 0.0006, + "num_tokens": 827896.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 51.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031813124660402536, + "kl": 0.001807287335395813, + "learning_rate": 2.4053333333333335e-06, + "loss": 0.0001, + "num_tokens": 828208.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 51.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009273199364542961, + "kl": 0.0021372660994529724, + "learning_rate": 2.405e-06, + "loss": 0.0001, + "num_tokens": 828444.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017374370247125626, + "kl": 0.0016625404241494834, + "learning_rate": 2.4046666666666666e-06, + "loss": 0.0001, + "num_tokens": 828704.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 51.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017382580786943436, + "kl": 0.006710775662213564, + "learning_rate": 2.4043333333333334e-06, + "loss": 0.0004, + "num_tokens": 829008.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.108415603637695, + "kl": 0.06864787393715233, + "learning_rate": 2.404e-06, + "loss": 0.0355, + "num_tokens": 829279.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 51.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13853999972343445, + "kl": 0.0072122784331440926, + "learning_rate": 2.4036666666666666e-06, + "loss": 0.0004, + "num_tokens": 829506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 51.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055810511112213135, + "kl": 0.15573750436306, + "learning_rate": 2.4033333333333333e-06, + "loss": 0.0078, + "num_tokens": 829816.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 51.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1302904188632965, + "kl": 0.007078180671669543, + "learning_rate": 2.403e-06, + "loss": 0.0004, + "num_tokens": 830032.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 51.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14294971525669098, + "kl": 0.024903587996959686, + "learning_rate": 2.4026666666666665e-06, + "loss": 0.0012, + "num_tokens": 830382.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 51.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003949082165490836, + "kl": 2.4043023586273193e-05, + "learning_rate": 2.4023333333333337e-06, + "loss": 0.0, + "num_tokens": 830602.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 51.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015595119446516037, + "kl": 0.0028768021147698164, + "learning_rate": 2.402e-06, + "loss": 0.0001, + "num_tokens": 830886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 51.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057516466826200485, + "kl": 0.09041155502200127, + "learning_rate": 2.401666666666667e-06, + "loss": 0.0045, + "num_tokens": 831250.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 51.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13289222121238708, + "kl": 0.03229084413032979, + "learning_rate": 2.401333333333333e-06, + "loss": 0.0016, + "num_tokens": 831539.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006144684739410877, + "kl": 0.0010871917475014925, + "learning_rate": 2.401e-06, + "loss": 0.0001, + "num_tokens": 831799.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 57.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 57.0, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 51.833333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.723008632659912, + "kl": 0.03888143226504326, + "learning_rate": 2.4006666666666667e-06, + "loss": 0.297, + "num_tokens": 832243.0, + "reward": 1.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 1.875, + "rewards/reward_combined/std": 3.25, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 51.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03026449866592884, + "kl": 0.0013829807576257735, + "learning_rate": 2.4003333333333335e-06, + "loss": 0.0001, + "num_tokens": 832513.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 51.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13052216172218323, + "kl": 0.03593877051025629, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0019, + "num_tokens": 832833.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 51.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.955604553222656, + "kl": 0.06896631233394146, + "learning_rate": 2.3996666666666667e-06, + "loss": 0.1753, + "num_tokens": 833196.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 51.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029458703473210335, + "kl": 0.006023412570357323, + "learning_rate": 2.3993333333333335e-06, + "loss": 0.0003, + "num_tokens": 833466.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 51.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8770244121551514, + "kl": 0.10628250613808632, + "learning_rate": 2.399e-06, + "loss": 0.0066, + "num_tokens": 833708.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 51.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.715651035308838, + "kl": 0.10342739894986153, + "learning_rate": 2.3986666666666666e-06, + "loss": 0.039, + "num_tokens": 834050.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 51.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06230156868696213, + "kl": 0.006974156480282545, + "learning_rate": 2.3983333333333334e-06, + "loss": 0.0004, + "num_tokens": 834380.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 51.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06352169811725616, + "kl": 0.03610933106392622, + "learning_rate": 2.398e-06, + "loss": 0.0018, + "num_tokens": 834787.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 52.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14158853888511658, + "kl": 0.009810572722926736, + "learning_rate": 2.3976666666666665e-06, + "loss": 0.0005, + "num_tokens": 835054.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 52.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07737588882446289, + "kl": 0.022331008221954107, + "learning_rate": 2.3973333333333333e-06, + "loss": 0.0011, + "num_tokens": 835348.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 52.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0573759600520134, + "kl": 0.005496953381225467, + "learning_rate": 2.397e-06, + "loss": 0.0003, + "num_tokens": 835641.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 52.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11601516604423523, + "kl": 0.02744139451533556, + "learning_rate": 2.396666666666667e-06, + "loss": 0.0014, + "num_tokens": 835944.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 52.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008734889328479767, + "kl": 0.0016702950233593583, + "learning_rate": 2.3963333333333337e-06, + "loss": 0.0001, + "num_tokens": 836164.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 52.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006803820841014385, + "kl": 0.0005280971527099609, + "learning_rate": 2.396e-06, + "loss": 0.0, + "num_tokens": 836424.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 52.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.146289825439453, + "kl": 0.012594989500939846, + "learning_rate": 2.395666666666667e-06, + "loss": 0.291, + "num_tokens": 836670.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 52.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07020723074674606, + "kl": 0.002377644181251526, + "learning_rate": 2.395333333333333e-06, + "loss": 0.0001, + "num_tokens": 836914.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 52.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056235987693071365, + "kl": 0.005837368196807802, + "learning_rate": 2.395e-06, + "loss": 0.0003, + "num_tokens": 837245.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016951628029346466, + "kl": 0.004820444737561047, + "learning_rate": 2.3946666666666667e-06, + "loss": 0.0002, + "num_tokens": 837533.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 52.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16498813033103943, + "kl": 0.05341300368309021, + "learning_rate": 2.3943333333333335e-06, + "loss": 0.0027, + "num_tokens": 837834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012449953937903047, + "kl": 0.001224594481755048, + "learning_rate": 2.3940000000000003e-06, + "loss": 0.0001, + "num_tokens": 838114.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 52.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09871586412191391, + "kl": 0.03253638092428446, + "learning_rate": 2.3936666666666666e-06, + "loss": 0.0016, + "num_tokens": 838477.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 52.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12448083609342575, + "kl": 0.002673305571079254, + "learning_rate": 2.3933333333333334e-06, + "loss": 0.0001, + "num_tokens": 838689.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 52.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009484909474849701, + "kl": 0.0004921044746879488, + "learning_rate": 2.393e-06, + "loss": 0.0, + "num_tokens": 838924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 52.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005183587782084942, + "kl": 0.0023222336312755942, + "learning_rate": 2.3926666666666666e-06, + "loss": 0.0001, + "num_tokens": 839208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 52.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015094439499080181, + "kl": 0.0008409619040321559, + "learning_rate": 2.3923333333333334e-06, + "loss": 0.0, + "num_tokens": 839480.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 52.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029992111958563328, + "kl": 0.0016938485205173492, + "learning_rate": 2.392e-06, + "loss": 0.0001, + "num_tokens": 839792.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 52.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028094839304685593, + "kl": 0.0018907834310084581, + "learning_rate": 2.391666666666667e-06, + "loss": 0.0001, + "num_tokens": 840066.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 52.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1068010926246643, + "kl": 0.03681113198399544, + "learning_rate": 2.3913333333333333e-06, + "loss": 0.0018, + "num_tokens": 840421.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 52.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04650650545954704, + "kl": 0.03493187949061394, + "learning_rate": 2.391e-06, + "loss": 0.0017, + "num_tokens": 840737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 52.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06854855269193649, + "kl": 0.017199012450873852, + "learning_rate": 2.390666666666667e-06, + "loss": 0.0008, + "num_tokens": 841069.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 52.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017151908949017525, + "kl": 0.26613669097423553, + "learning_rate": 2.3903333333333336e-06, + "loss": 0.0133, + "num_tokens": 841373.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 52.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0067215291783213615, + "kl": 0.0013058037147857249, + "learning_rate": 2.39e-06, + "loss": 0.0001, + "num_tokens": 841633.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 52.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06889953464269638, + "kl": 0.01987474039196968, + "learning_rate": 2.3896666666666668e-06, + "loss": 0.001, + "num_tokens": 841915.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028096316382288933, + "kl": 0.002080070087686181, + "learning_rate": 2.389333333333333e-06, + "loss": 0.0001, + "num_tokens": 842183.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 52.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09361562132835388, + "kl": 0.0048256367444992065, + "learning_rate": 2.389e-06, + "loss": 0.0002, + "num_tokens": 842395.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 52.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009083788841962814, + "kl": 0.0048482418060302734, + "learning_rate": 2.3886666666666667e-06, + "loss": 0.0002, + "num_tokens": 842663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 52.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07530864328145981, + "kl": 0.1553303226828575, + "learning_rate": 2.3883333333333335e-06, + "loss": 0.0078, + "num_tokens": 842974.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 52.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027725214138627052, + "kl": 0.004206315497867763, + "learning_rate": 2.3880000000000003e-06, + "loss": 0.0002, + "num_tokens": 843276.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 52.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0725192129611969, + "kl": 0.0014049112796783447, + "learning_rate": 2.3876666666666666e-06, + "loss": 0.0001, + "num_tokens": 843532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011141877621412277, + "kl": 0.0006590724806301296, + "learning_rate": 2.3873333333333334e-06, + "loss": 0.0, + "num_tokens": 843852.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 52.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5030946731567383, + "kl": 0.07586564496159554, + "learning_rate": 2.3869999999999998e-06, + "loss": 0.0424, + "num_tokens": 844188.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03774819150567055, + "kl": 0.007036800729110837, + "learning_rate": 2.386666666666667e-06, + "loss": 0.0003, + "num_tokens": 844479.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 52.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01741122081875801, + "kl": 0.001554942165967077, + "learning_rate": 2.3863333333333333e-06, + "loss": 0.0001, + "num_tokens": 844739.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 52.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031002017203718424, + "kl": 0.01629612222313881, + "learning_rate": 2.386e-06, + "loss": 0.0008, + "num_tokens": 844999.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 52.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3425886332988739, + "kl": 0.07018738985061646, + "learning_rate": 2.385666666666667e-06, + "loss": 0.0035, + "num_tokens": 845406.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 52.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33491626381874084, + "kl": 0.028047680854797363, + "learning_rate": 2.3853333333333333e-06, + "loss": 0.0014, + "num_tokens": 845618.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 52.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15327498316764832, + "kl": 0.06014397367835045, + "learning_rate": 2.385e-06, + "loss": 0.003, + "num_tokens": 845967.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 52.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7377099990844727, + "kl": 0.06164982728660107, + "learning_rate": 2.384666666666667e-06, + "loss": -0.1307, + "num_tokens": 846325.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 52.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.34040641784668, + "kl": 0.047206103801727295, + "learning_rate": 2.3843333333333336e-06, + "loss": 0.0664, + "num_tokens": 846604.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 52.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.619198799133301, + "kl": 0.09781079739332199, + "learning_rate": 2.384e-06, + "loss": 0.0479, + "num_tokens": 846941.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 52.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06523977965116501, + "kl": 0.08886785805225372, + "learning_rate": 2.3836666666666667e-06, + "loss": 0.0044, + "num_tokens": 847305.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 52.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00035995987127535045, + "kl": 5.9567391872406006e-05, + "learning_rate": 2.383333333333333e-06, + "loss": 0.0, + "num_tokens": 847525.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 52.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04773977771401405, + "kl": 0.00898170773871243, + "learning_rate": 2.383e-06, + "loss": 0.0004, + "num_tokens": 847797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 52.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03776170685887337, + "kl": 0.010953939985483885, + "learning_rate": 2.382666666666667e-06, + "loss": 0.0005, + "num_tokens": 848123.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 52.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07799394428730011, + "kl": 0.021837515902007, + "learning_rate": 2.3823333333333335e-06, + "loss": 0.0011, + "num_tokens": 848411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 52.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008788693696260452, + "kl": 0.0017764195799827576, + "learning_rate": 2.3820000000000002e-06, + "loss": 0.0001, + "num_tokens": 848627.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 52.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08096367865800858, + "kl": 0.00670659338356927, + "learning_rate": 2.3816666666666666e-06, + "loss": 0.0003, + "num_tokens": 848941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 52.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1365240067243576, + "kl": 0.006359230261296034, + "learning_rate": 2.3813333333333334e-06, + "loss": 0.0003, + "num_tokens": 849237.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 52.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017864925786852837, + "kl": 0.007465390954166651, + "learning_rate": 2.381e-06, + "loss": 0.0004, + "num_tokens": 849511.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 48.75, + "completions/mean_terminated_length": 48.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 52.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2301209270954132, + "kl": 0.024494120851159096, + "learning_rate": 2.380666666666667e-06, + "loss": 0.0012, + "num_tokens": 849926.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 52.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06736123561859131, + "kl": 0.009781391359865665, + "learning_rate": 2.3803333333333333e-06, + "loss": 0.0005, + "num_tokens": 850184.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 52.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02686416730284691, + "kl": 0.005065662087872624, + "learning_rate": 2.38e-06, + "loss": 0.0003, + "num_tokens": 850516.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10447411984205246, + "kl": 0.02131066471338272, + "learning_rate": 2.379666666666667e-06, + "loss": 0.0011, + "num_tokens": 850804.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2185179740190506, + "kl": 0.02052814792841673, + "learning_rate": 2.3793333333333332e-06, + "loss": 0.0011, + "num_tokens": 851080.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08419032394886017, + "kl": 0.036733237095177174, + "learning_rate": 2.379e-06, + "loss": 0.0018, + "num_tokens": 851352.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 53.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.869204521179199, + "kl": 0.16254862397909164, + "learning_rate": 2.378666666666667e-06, + "loss": 0.0596, + "num_tokens": 851665.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.010416666977107525, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010416666977107525, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 53.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.463963985443115, + "kl": 0.08973372355103493, + "learning_rate": 2.3783333333333336e-06, + "loss": -0.0053, + "num_tokens": 851990.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07112845778465271, + "kl": 0.028711308652418666, + "learning_rate": 2.378e-06, + "loss": 0.0014, + "num_tokens": 852278.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 53.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05987062305212021, + "kl": 0.007646935526281595, + "learning_rate": 2.3776666666666667e-06, + "loss": 0.0004, + "num_tokens": 852584.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 53.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03847935423254967, + "kl": 0.0003005564212799072, + "learning_rate": 2.377333333333333e-06, + "loss": 0.0, + "num_tokens": 852796.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 53.148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.799895763397217, + "kl": 0.025639529339969158, + "learning_rate": 2.377e-06, + "loss": 0.255, + "num_tokens": 853154.0, + "reward": 6.050000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 6.050000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 53.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029934579506516457, + "kl": 0.0018062489107251167, + "learning_rate": 2.376666666666667e-06, + "loss": 0.0001, + "num_tokens": 853466.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 53.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.199644565582275, + "kl": 0.11968174437060952, + "learning_rate": 2.3763333333333334e-06, + "loss": 0.1412, + "num_tokens": 853790.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 53.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10542991757392883, + "kl": 0.039005622267723083, + "learning_rate": 2.376e-06, + "loss": 0.0021, + "num_tokens": 854158.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 53.22222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2771737575531006, + "kl": 0.06000687135383487, + "learning_rate": 2.3756666666666666e-06, + "loss": 0.0324, + "num_tokens": 854477.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 53.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010056917555630207, + "kl": 0.0006262307288125157, + "learning_rate": 2.3753333333333333e-06, + "loss": 0.0, + "num_tokens": 854747.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04362093284726143, + "kl": 0.006896126549690962, + "learning_rate": 2.375e-06, + "loss": 0.0003, + "num_tokens": 855035.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 53.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.952740669250488, + "kl": 0.015314777381718159, + "learning_rate": 2.374666666666667e-06, + "loss": 0.0794, + "num_tokens": 855371.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 53.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011432938277721405, + "kl": 0.001405414892360568, + "learning_rate": 2.3743333333333333e-06, + "loss": 0.0001, + "num_tokens": 855631.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 53.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015829563140869, + "kl": 0.10992373898625374, + "learning_rate": 2.374e-06, + "loss": -0.0708, + "num_tokens": 856001.0, + "reward": 6.625, + "reward_std": 2.428133726119995, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.428133726119995, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 53.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546252615749836, + "kl": 0.0028338336560409516, + "learning_rate": 2.373666666666667e-06, + "loss": 0.0001, + "num_tokens": 856235.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048110343515872955, + "kl": 0.009807596215978265, + "learning_rate": 2.373333333333333e-06, + "loss": 0.0005, + "num_tokens": 856539.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 53.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08813687413930893, + "kl": 0.034528578631579876, + "learning_rate": 2.373e-06, + "loss": 0.0017, + "num_tokens": 856875.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 53.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004411724396049976, + "kl": 0.016107586212456226, + "learning_rate": 2.3726666666666668e-06, + "loss": 0.0008, + "num_tokens": 857135.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 53.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.162947654724121, + "kl": 0.06655381433665752, + "learning_rate": 2.3723333333333335e-06, + "loss": -0.0039, + "num_tokens": 857538.0, + "reward": 2.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.5, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 53.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13354608416557312, + "kl": 0.007072292268276215, + "learning_rate": 2.372e-06, + "loss": 0.0003, + "num_tokens": 857748.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 53.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06629502773284912, + "kl": 0.0019564018584787846, + "learning_rate": 2.3716666666666667e-06, + "loss": 0.0001, + "num_tokens": 858044.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 53.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0499211922287941, + "kl": 0.0028888892848044634, + "learning_rate": 2.371333333333333e-06, + "loss": 0.0001, + "num_tokens": 858287.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 53.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012173829600214958, + "kl": 0.05078907683491707, + "learning_rate": 2.3710000000000003e-06, + "loss": 0.0025, + "num_tokens": 858619.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 53.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.867785453796387, + "kl": 0.09830822050571442, + "learning_rate": 2.370666666666667e-06, + "loss": 0.4734, + "num_tokens": 858926.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05336439609527588, + "kl": 0.006315299076959491, + "learning_rate": 2.3703333333333334e-06, + "loss": 0.0003, + "num_tokens": 859222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 53.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06559010595083237, + "kl": 0.00980394147336483, + "learning_rate": 2.37e-06, + "loss": 0.0005, + "num_tokens": 859496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.212857246398926, + "kl": 0.029517007991671562, + "learning_rate": 2.3696666666666665e-06, + "loss": 0.1779, + "num_tokens": 859782.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014080351684242487, + "kl": 0.0013441123301163316, + "learning_rate": 2.3693333333333333e-06, + "loss": 0.0001, + "num_tokens": 860062.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 53.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16058064997196198, + "kl": 0.008736562798731029, + "learning_rate": 2.369e-06, + "loss": 0.0004, + "num_tokens": 860318.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 53.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.153318405151367, + "kl": 0.01882866397500038, + "learning_rate": 2.368666666666667e-06, + "loss": 0.0026, + "num_tokens": 860578.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 53.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003658926289062947, + "kl": 6.495416164398193e-05, + "learning_rate": 2.3683333333333332e-06, + "loss": 0.0, + "num_tokens": 860798.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 53.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07831844687461853, + "kl": 0.0020904242992401123, + "learning_rate": 2.368e-06, + "loss": 0.0002, + "num_tokens": 861014.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 53.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02326568402349949, + "kl": 0.09351816400885582, + "learning_rate": 2.367666666666667e-06, + "loss": 0.0047, + "num_tokens": 861380.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.987786293029785, + "kl": 0.3220704160630703, + "learning_rate": 2.367333333333333e-06, + "loss": 0.0075, + "num_tokens": 861665.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08635083585977554, + "kl": 0.01907160598784685, + "learning_rate": 2.367e-06, + "loss": 0.001, + "num_tokens": 861953.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 53.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.488400459289551, + "kl": 0.07212156802415848, + "learning_rate": 2.3666666666666667e-06, + "loss": 0.331, + "num_tokens": 862430.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 53.74074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9524062871932983, + "kl": 0.00367978448048234, + "learning_rate": 2.3663333333333335e-06, + "loss": -0.0, + "num_tokens": 862714.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 53.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041116755455732346, + "kl": 0.0270707830786705, + "learning_rate": 2.366e-06, + "loss": 0.0011, + "num_tokens": 863099.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 53.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005555221810936928, + "kl": 0.0029350891709327698, + "learning_rate": 2.3656666666666667e-06, + "loss": 0.0001, + "num_tokens": 863335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 53.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013231276534497738, + "kl": 0.266848623752594, + "learning_rate": 2.3653333333333334e-06, + "loss": 0.0133, + "num_tokens": 863639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 53.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04293171688914299, + "kl": 0.008126812055706978, + "learning_rate": 2.3650000000000002e-06, + "loss": 0.0004, + "num_tokens": 863909.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 53.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27496668696403503, + "kl": 0.028814285062253475, + "learning_rate": 2.364666666666667e-06, + "loss": 0.0014, + "num_tokens": 864241.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 53.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.342770576477051, + "kl": 0.10318545438349247, + "learning_rate": 2.3643333333333334e-06, + "loss": 0.0761, + "num_tokens": 864545.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02570682018995285, + "kl": 0.0009674452594481409, + "learning_rate": 2.364e-06, + "loss": 0.0, + "num_tokens": 864863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 53.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9785778522491455, + "kl": 0.14013474993407726, + "learning_rate": 2.3636666666666665e-06, + "loss": 0.1152, + "num_tokens": 865201.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 53.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001898481510579586, + "kl": 0.0007997065840754658, + "learning_rate": 2.3633333333333333e-06, + "loss": 0.0, + "num_tokens": 865461.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 53.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021994970738887787, + "kl": 0.002806131378747523, + "learning_rate": 2.363e-06, + "loss": 0.0001, + "num_tokens": 865741.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 53.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22368617355823517, + "kl": 0.013450096594169736, + "learning_rate": 2.362666666666667e-06, + "loss": 0.0008, + "num_tokens": 865963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 53.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06795808672904968, + "kl": 0.009241765830665827, + "learning_rate": 2.3623333333333332e-06, + "loss": 0.0005, + "num_tokens": 866293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 53.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1688670814037323, + "kl": 0.025433705188333988, + "learning_rate": 2.362e-06, + "loss": 0.0013, + "num_tokens": 866593.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 54.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05443761497735977, + "kl": 0.007130228914320469, + "learning_rate": 2.3616666666666668e-06, + "loss": 0.0004, + "num_tokens": 866861.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 54.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023576179519295692, + "kl": 0.005491400370374322, + "learning_rate": 2.361333333333333e-06, + "loss": 0.0003, + "num_tokens": 867129.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 54.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09523707628250122, + "kl": 0.012822123942896724, + "learning_rate": 2.3610000000000003e-06, + "loss": 0.0008, + "num_tokens": 867391.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 54.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.157207489013672, + "kl": 0.023810354061424732, + "learning_rate": 2.3606666666666667e-06, + "loss": 0.0399, + "num_tokens": 867752.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 54.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017751636914908886, + "kl": 3.5278499126434326e-05, + "learning_rate": 2.3603333333333335e-06, + "loss": 0.0, + "num_tokens": 867964.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 54.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36503705382347107, + "kl": 0.03425721265375614, + "learning_rate": 2.36e-06, + "loss": 0.0017, + "num_tokens": 868270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 54.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037777298712171614, + "kl": 6.160885095596313e-05, + "learning_rate": 2.3596666666666666e-06, + "loss": 0.0, + "num_tokens": 868490.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06831710040569305, + "kl": 0.015107514336705208, + "learning_rate": 2.3593333333333334e-06, + "loss": 0.0008, + "num_tokens": 868774.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 54.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028325184248387814, + "kl": 0.0003734744241228327, + "learning_rate": 2.359e-06, + "loss": 0.0, + "num_tokens": 869086.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 54.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07990197837352753, + "kl": 0.16134540736675262, + "learning_rate": 2.358666666666667e-06, + "loss": 0.0081, + "num_tokens": 869397.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 54.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17696452140808105, + "kl": 0.006129124900326133, + "learning_rate": 2.3583333333333333e-06, + "loss": 0.0003, + "num_tokens": 869671.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13193003833293915, + "kl": 0.010700775310397148, + "learning_rate": 2.358e-06, + "loss": 0.0005, + "num_tokens": 869961.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14096450805664062, + "kl": 0.04770764522254467, + "learning_rate": 2.3576666666666665e-06, + "loss": 0.0024, + "num_tokens": 870234.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 54.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011629119981080294, + "kl": 0.0013172925100661814, + "learning_rate": 2.3573333333333333e-06, + "loss": 0.0001, + "num_tokens": 870514.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007323765195906162, + "kl": 0.010137287434190512, + "learning_rate": 2.357e-06, + "loss": 0.0005, + "num_tokens": 870786.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 66.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 66.0, + "completions/mean_terminated_length": 2.6666667461395264, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 54.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.352905035018921, + "kl": 0.007515184581279755, + "learning_rate": 2.356666666666667e-06, + "loss": 0.4923, + "num_tokens": 871246.0, + "reward": 1.7999999523162842, + "reward_std": 3.4000000953674316, + "rewards/reward_combined/mean": 1.7999999523162842, + "rewards/reward_combined/std": 3.3999998569488525, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 54.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006253378931432962, + "kl": 0.0027934685349464417, + "learning_rate": 2.356333333333333e-06, + "loss": 0.0001, + "num_tokens": 871482.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 54.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04821046069264412, + "kl": 0.026681117713451385, + "learning_rate": 2.356e-06, + "loss": 0.0013, + "num_tokens": 871833.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 54.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48042118549346924, + "kl": 0.08606377243995667, + "learning_rate": 2.3556666666666668e-06, + "loss": 0.0042, + "num_tokens": 872164.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 54.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009499441832304, + "kl": 0.0004187636077404022, + "learning_rate": 2.3553333333333335e-06, + "loss": 0.0, + "num_tokens": 872408.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 54.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06627187877893448, + "kl": 0.008202591445297003, + "learning_rate": 2.3550000000000003e-06, + "loss": 0.0004, + "num_tokens": 872680.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 54.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4705772399902344, + "kl": 0.030181247740983963, + "learning_rate": 2.3546666666666667e-06, + "loss": 0.0013, + "num_tokens": 872899.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 54.407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.576265811920166, + "kl": 0.0294879162684083, + "learning_rate": 2.3543333333333335e-06, + "loss": 0.0471, + "num_tokens": 873236.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 54.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032603777945041656, + "kl": 0.002134602051228285, + "learning_rate": 2.354e-06, + "loss": 0.0001, + "num_tokens": 873504.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 54.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012353312224149704, + "kl": 0.003278389573097229, + "learning_rate": 2.3536666666666666e-06, + "loss": 0.0002, + "num_tokens": 873720.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.018518518656492233, + "clip_ratio/low_min": 0.018518518656492233, + "clip_ratio/region_mean": 0.018518518656492233, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 54.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3510215282440186, + "kl": 0.36710023880004883, + "learning_rate": 2.3533333333333334e-06, + "loss": 0.0219, + "num_tokens": 874119.0, + "reward": 1.625, + "reward_std": 1.314977765083313, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.3149778842926025, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 54.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01726127415895462, + "kl": 0.005183990811929107, + "learning_rate": 2.353e-06, + "loss": 0.0003, + "num_tokens": 874408.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 54.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3342766761779785, + "kl": 0.07593364268541336, + "learning_rate": 2.352666666666667e-06, + "loss": 0.0038, + "num_tokens": 874746.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 54.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11513447761535645, + "kl": 0.008505598991177976, + "learning_rate": 2.3523333333333333e-06, + "loss": 0.0004, + "num_tokens": 874980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 54.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13031171262264252, + "kl": 0.03345475532114506, + "learning_rate": 2.352e-06, + "loss": 0.0017, + "num_tokens": 875276.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 54.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10780889540910721, + "kl": 0.02938194014132023, + "learning_rate": 2.3516666666666665e-06, + "loss": 0.0015, + "num_tokens": 875603.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 54.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03160043805837631, + "kl": 0.001262330886675045, + "learning_rate": 2.3513333333333332e-06, + "loss": 0.0001, + "num_tokens": 875923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 54.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07768853008747101, + "kl": 0.00892961397767067, + "learning_rate": 2.351e-06, + "loss": 0.0005, + "num_tokens": 876196.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 54.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11657459288835526, + "kl": 0.004745267331600189, + "learning_rate": 2.350666666666667e-06, + "loss": 0.0003, + "num_tokens": 876410.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 54.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2351139485836029, + "kl": 0.06614034064114094, + "learning_rate": 2.3503333333333336e-06, + "loss": 0.0034, + "num_tokens": 876713.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25175708532333374, + "kl": 0.030793271958827972, + "learning_rate": 2.35e-06, + "loss": 0.0016, + "num_tokens": 876995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 54.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0694553479552269, + "kl": 0.0035574163775891066, + "learning_rate": 2.3496666666666667e-06, + "loss": 0.0002, + "num_tokens": 877291.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 54.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.575503826141357, + "kl": 0.1543746218085289, + "learning_rate": 2.3493333333333335e-06, + "loss": 0.0365, + "num_tokens": 877643.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 54.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03691914677619934, + "kl": 0.014725782479217742, + "learning_rate": 2.3490000000000003e-06, + "loss": 0.0008, + "num_tokens": 877929.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 54.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046783048659563065, + "kl": 0.0012613177314051427, + "learning_rate": 2.3486666666666667e-06, + "loss": 0.0001, + "num_tokens": 878185.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 54.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2654917538166046, + "kl": 0.017351491376757622, + "learning_rate": 2.3483333333333334e-06, + "loss": 0.0009, + "num_tokens": 878515.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 54.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12645743787288666, + "kl": 0.02534924726933241, + "learning_rate": 2.348e-06, + "loss": 0.0013, + "num_tokens": 878849.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 54.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01918371580541134, + "kl": 0.0005114320665597916, + "learning_rate": 2.3476666666666666e-06, + "loss": 0.0, + "num_tokens": 879161.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 54.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019872759003192186, + "kl": 0.0004409998655319214, + "learning_rate": 2.3473333333333334e-06, + "loss": 0.0, + "num_tokens": 879433.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 54.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01509452611207962, + "kl": 0.2664918303489685, + "learning_rate": 2.347e-06, + "loss": 0.0133, + "num_tokens": 879737.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 54.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12018117308616638, + "kl": 0.012594382744282484, + "learning_rate": 2.346666666666667e-06, + "loss": 0.0007, + "num_tokens": 880061.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 54.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010643661953508854, + "kl": 0.0010574540356174111, + "learning_rate": 2.3463333333333333e-06, + "loss": 0.0001, + "num_tokens": 880323.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 54.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09327709674835205, + "kl": 0.08426785469055176, + "learning_rate": 2.346e-06, + "loss": 0.0042, + "num_tokens": 880687.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 54.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008367214351892471, + "kl": 0.0008837968052830547, + "learning_rate": 2.3456666666666664e-06, + "loss": 0.0, + "num_tokens": 880947.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 54.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3067464232444763, + "kl": 0.07646085321903229, + "learning_rate": 2.3453333333333336e-06, + "loss": 0.003, + "num_tokens": 881285.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 54.925925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5075974464416504, + "kl": 0.057480476796627045, + "learning_rate": 2.345e-06, + "loss": 0.0485, + "num_tokens": 881666.0, + "reward": 6.125, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 54.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046370603144168854, + "kl": 0.007213607896119356, + "learning_rate": 2.3446666666666668e-06, + "loss": 0.0004, + "num_tokens": 882000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 54.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035865188110619783, + "kl": 0.016245152801275253, + "learning_rate": 2.3443333333333336e-06, + "loss": 0.0008, + "num_tokens": 882260.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 54.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08152646571397781, + "kl": 0.008194145280867815, + "learning_rate": 2.344e-06, + "loss": 0.0004, + "num_tokens": 882549.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 55.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025813622400164604, + "kl": 0.0010362975299358368, + "learning_rate": 2.3436666666666667e-06, + "loss": 0.0001, + "num_tokens": 882809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 55.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.762274980545044, + "kl": 0.11066636070609093, + "learning_rate": 2.3433333333333335e-06, + "loss": 0.0606, + "num_tokens": 883152.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 55.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06702513992786407, + "kl": 0.027682156302034855, + "learning_rate": 2.3430000000000003e-06, + "loss": 0.0014, + "num_tokens": 883455.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 55.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3494166135787964, + "kl": 0.04059334844350815, + "learning_rate": 2.3426666666666666e-06, + "loss": 0.0017, + "num_tokens": 883773.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 46.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 55.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041744161397218704, + "kl": 0.012252789922058582, + "learning_rate": 2.3423333333333334e-06, + "loss": 0.0006, + "num_tokens": 884179.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 55.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07226499915122986, + "kl": 0.007680713664740324, + "learning_rate": 2.3419999999999998e-06, + "loss": 0.0004, + "num_tokens": 884449.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 55.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23819291591644287, + "kl": 0.04472205974161625, + "learning_rate": 2.3416666666666666e-06, + "loss": 0.0022, + "num_tokens": 884810.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 55.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003592208959162235, + "kl": 6.392598152160645e-05, + "learning_rate": 2.3413333333333338e-06, + "loss": 0.0, + "num_tokens": 885030.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 55.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23653821647167206, + "kl": 0.02786953785107471, + "learning_rate": 2.341e-06, + "loss": 0.0014, + "num_tokens": 885298.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 55.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1875661462545395, + "kl": 0.015617812983691692, + "learning_rate": 2.340666666666667e-06, + "loss": 0.0007, + "num_tokens": 885532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 55.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.101525783538818, + "kl": 0.10640087351202965, + "learning_rate": 2.3403333333333333e-06, + "loss": 0.0224, + "num_tokens": 885854.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 55.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115035280585289, + "kl": 0.03137406148016453, + "learning_rate": 2.34e-06, + "loss": 0.0016, + "num_tokens": 886148.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 55.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10560853779315948, + "kl": 0.022690760903060436, + "learning_rate": 2.339666666666667e-06, + "loss": 0.0012, + "num_tokens": 886492.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 55.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039279937744140625, + "kl": 0.008485760539770126, + "learning_rate": 2.3393333333333336e-06, + "loss": 0.0004, + "num_tokens": 886780.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 55.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07722441107034683, + "kl": 0.010541489813476801, + "learning_rate": 2.339e-06, + "loss": 0.0005, + "num_tokens": 887086.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 84.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 84.25, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 55.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4168365001678467, + "kl": 0.03890683501958847, + "learning_rate": 2.3386666666666668e-06, + "loss": 0.3008, + "num_tokens": 887639.0, + "reward": 3.424999952316284, + "reward_std": 3.797696828842163, + "rewards/reward_combined/mean": 3.424999952316284, + "rewards/reward_combined/std": 3.797696590423584, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 55.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1464974731206894, + "kl": 0.019229216501116753, + "learning_rate": 2.3383333333333335e-06, + "loss": 0.001, + "num_tokens": 887923.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 55.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5898351669311523, + "kl": 0.026735293678939342, + "learning_rate": 2.338e-06, + "loss": 0.4322, + "num_tokens": 888506.0, + "reward": 5.550000190734863, + "reward_std": 3.9000000953674316, + "rewards/reward_combined/mean": 5.550000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 55.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02812797762453556, + "kl": 0.003407878102734685, + "learning_rate": 2.3376666666666667e-06, + "loss": 0.0002, + "num_tokens": 888802.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 55.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04721495509147644, + "kl": 0.007408793084323406, + "learning_rate": 2.3373333333333335e-06, + "loss": 0.0004, + "num_tokens": 889123.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 55.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2649492621421814, + "kl": 0.019243795075453818, + "learning_rate": 2.3370000000000002e-06, + "loss": 0.0006, + "num_tokens": 889377.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 55.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055546633899211884, + "kl": 0.026678916066884995, + "learning_rate": 2.3366666666666666e-06, + "loss": 0.0013, + "num_tokens": 889795.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 55.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46795764565467834, + "kl": 0.0460997000336647, + "learning_rate": 2.3363333333333334e-06, + "loss": 0.0023, + "num_tokens": 890055.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 55.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006224581506103277, + "kl": 0.00042625516653060913, + "learning_rate": 2.3359999999999997e-06, + "loss": 0.0, + "num_tokens": 890299.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 55.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7861798405647278, + "kl": 0.10501637309789658, + "learning_rate": 2.3356666666666665e-06, + "loss": 0.0056, + "num_tokens": 890613.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 55.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3208851218223572, + "kl": 0.04298049118369818, + "learning_rate": 2.3353333333333337e-06, + "loss": 0.0022, + "num_tokens": 890904.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 55.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07854863256216049, + "kl": 0.15339665114879608, + "learning_rate": 2.335e-06, + "loss": 0.0077, + "num_tokens": 891217.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 55.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008497952483594418, + "kl": 0.00971553847193718, + "learning_rate": 2.334666666666667e-06, + "loss": 0.0005, + "num_tokens": 891489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 55.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04913301765918732, + "kl": 0.005250135902315378, + "learning_rate": 2.3343333333333332e-06, + "loss": 0.0003, + "num_tokens": 891761.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 55.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05256142467260361, + "kl": 0.010212956462055445, + "learning_rate": 2.334e-06, + "loss": 0.0005, + "num_tokens": 892052.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 55.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.948071241378784, + "kl": 0.007339737727306783, + "learning_rate": 2.333666666666667e-06, + "loss": -0.0483, + "num_tokens": 892343.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 55.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006259546149522066, + "kl": 0.0010361314052715898, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.0001, + "num_tokens": 892603.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 55.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004210996441543102, + "kl": 0.016116227954626083, + "learning_rate": 2.333e-06, + "loss": 0.0008, + "num_tokens": 892863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 55.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001128842937760055, + "kl": 0.0012331880861893296, + "learning_rate": 2.3326666666666667e-06, + "loss": 0.0001, + "num_tokens": 893143.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 55.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006712694652378559, + "kl": 0.0003460347652435303, + "learning_rate": 2.3323333333333335e-06, + "loss": 0.0, + "num_tokens": 893363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 55.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10337196290493011, + "kl": 0.009352536872029305, + "learning_rate": 2.332e-06, + "loss": 0.0005, + "num_tokens": 893633.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 55.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21204231679439545, + "kl": 0.014737354591488838, + "learning_rate": 2.3316666666666666e-06, + "loss": 0.0007, + "num_tokens": 893899.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.008064515888690948, + "clip_ratio/high_mean": 0.008064515888690948, + "clip_ratio/low_mean": 0.007936508394777775, + "clip_ratio/low_min": 0.007936508394777775, + "clip_ratio/region_mean": 0.016001024283468723, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 55.68518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7153398990631104, + "kl": 0.07125603780150414, + "learning_rate": 2.3313333333333334e-06, + "loss": 0.032, + "num_tokens": 894252.0, + "reward": 6.0, + "reward_std": 3.34165620803833, + "rewards/reward_combined/mean": 6.0, + "rewards/reward_combined/std": 3.34165620803833, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 55.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10559558868408203, + "kl": 0.002482399344444275, + "learning_rate": 2.3310000000000002e-06, + "loss": 0.0001, + "num_tokens": 894464.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.009259259328246117, + "clip_ratio/low_min": 0.009259259328246117, + "clip_ratio/region_mean": 0.009259259328246117, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 55.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.876154899597168, + "kl": 0.5876728873699903, + "learning_rate": 2.3306666666666666e-06, + "loss": 0.0493, + "num_tokens": 894780.0, + "reward": 2.375, + "reward_std": 1.8874585628509521, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.8874585628509521, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 55.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16329513490200043, + "kl": 0.020946836099028587, + "learning_rate": 2.3303333333333334e-06, + "loss": 0.001, + "num_tokens": 895062.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 55.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07556506991386414, + "kl": 0.02021302655339241, + "learning_rate": 2.3299999999999997e-06, + "loss": 0.001, + "num_tokens": 895394.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 55.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021483929827809334, + "kl": 0.0024198753526434302, + "learning_rate": 2.329666666666667e-06, + "loss": 0.0001, + "num_tokens": 895726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 55.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.228107452392578, + "kl": 0.031029794365167618, + "learning_rate": 2.3293333333333337e-06, + "loss": 0.1464, + "num_tokens": 895965.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 55.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.369430273771286, + "kl": 0.02256050705909729, + "learning_rate": 2.329e-06, + "loss": 0.0011, + "num_tokens": 896181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 55.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05093751475214958, + "kl": 0.09115274250507355, + "learning_rate": 2.328666666666667e-06, + "loss": 0.0046, + "num_tokens": 896547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.012195121496915817, + "clip_ratio/low_min": 0.012195121496915817, + "clip_ratio/region_mean": 0.012195121496915817, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 55.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.179732322692871, + "kl": 0.002333106007426977, + "learning_rate": 2.328333333333333e-06, + "loss": 0.0557, + "num_tokens": 896864.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 55.870370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.867720365524292, + "kl": 0.05079939030110836, + "learning_rate": 2.328e-06, + "loss": -0.0267, + "num_tokens": 897169.0, + "reward": 4.5, + "reward_std": 2.345207929611206, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 2.345207929611206, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 55.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021309273317456245, + "kl": 0.26525846123695374, + "learning_rate": 2.3276666666666668e-06, + "loss": 0.0133, + "num_tokens": 897473.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 55.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016170399263501167, + "kl": 0.00035906137782149017, + "learning_rate": 2.3273333333333336e-06, + "loss": 0.0, + "num_tokens": 897729.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 55.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0910065621137619, + "kl": 0.007152083155233413, + "learning_rate": 2.327e-06, + "loss": 0.0003, + "num_tokens": 898008.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 55.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20022736489772797, + "kl": 0.011388115584850311, + "learning_rate": 2.3266666666666667e-06, + "loss": 0.0006, + "num_tokens": 898220.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 55.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031021635979413986, + "kl": 0.0023814737796783447, + "learning_rate": 2.3263333333333335e-06, + "loss": 0.0001, + "num_tokens": 898432.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 55.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01577245071530342, + "kl": 0.0012783058336935937, + "learning_rate": 2.326e-06, + "loss": 0.0001, + "num_tokens": 898706.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 56.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15930642187595367, + "kl": 0.03882480412721634, + "learning_rate": 2.3256666666666666e-06, + "loss": 0.0019, + "num_tokens": 898981.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 56.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002751645166426897, + "kl": 0.0033688247203826904, + "learning_rate": 2.3253333333333334e-06, + "loss": 0.0002, + "num_tokens": 899217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 56.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015981562435626984, + "kl": 0.0035631279461085796, + "learning_rate": 2.325e-06, + "loss": 0.0002, + "num_tokens": 899509.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12137234210968018, + "kl": 0.005759598687291145, + "learning_rate": 2.3246666666666665e-06, + "loss": 0.0003, + "num_tokens": 899771.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 56.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036598711740225554, + "kl": 0.016190843656659126, + "learning_rate": 2.3243333333333333e-06, + "loss": 0.0008, + "num_tokens": 900031.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 56.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06579843908548355, + "kl": 0.0029603760922327638, + "learning_rate": 2.324e-06, + "loss": 0.0001, + "num_tokens": 900264.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 56.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042218174785375595, + "kl": 0.024056091904640198, + "learning_rate": 2.323666666666667e-06, + "loss": 0.0013, + "num_tokens": 900553.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 56.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022274121642112732, + "kl": 0.2651331424713135, + "learning_rate": 2.3233333333333337e-06, + "loss": 0.0133, + "num_tokens": 900857.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 56.148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.141805171966553, + "kl": 0.07211090251803398, + "learning_rate": 2.323e-06, + "loss": 0.0458, + "num_tokens": 901153.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 56.166666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.002310037612915, + "kl": 0.10307259391993284, + "learning_rate": 2.322666666666667e-06, + "loss": -0.0001, + "num_tokens": 901441.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.237639904022217, + "kl": 0.0351610891520977, + "learning_rate": 2.322333333333333e-06, + "loss": 0.0439, + "num_tokens": 901715.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 56.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2782034873962402, + "kl": 0.025777772068977356, + "learning_rate": 2.322e-06, + "loss": 0.0102, + "num_tokens": 902046.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 56.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07503416389226913, + "kl": 0.004768489394336939, + "learning_rate": 2.3216666666666667e-06, + "loss": 0.0002, + "num_tokens": 902364.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 56.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05003504827618599, + "kl": 0.010098483297042549, + "learning_rate": 2.3213333333333335e-06, + "loss": 0.0005, + "num_tokens": 902691.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 56.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3545469641685486, + "kl": 0.061767819337546825, + "learning_rate": 2.321e-06, + "loss": 0.0032, + "num_tokens": 903047.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 56.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020279807969927788, + "kl": 0.0008650238742120564, + "learning_rate": 2.3206666666666667e-06, + "loss": 0.0, + "num_tokens": 903317.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026220832020044327, + "kl": 0.006930738687515259, + "learning_rate": 2.3203333333333335e-06, + "loss": 0.0003, + "num_tokens": 903585.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 56.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04772263392806053, + "kl": 0.005179248750209808, + "learning_rate": 2.32e-06, + "loss": 0.0003, + "num_tokens": 903795.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 56.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3156445324420929, + "kl": 0.09944610297679901, + "learning_rate": 2.319666666666667e-06, + "loss": 0.0043, + "num_tokens": 904153.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 56.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012312165461480618, + "kl": 0.0016637109220027924, + "learning_rate": 2.3193333333333334e-06, + "loss": 0.0001, + "num_tokens": 904465.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 56.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.618168354034424, + "kl": 0.3945522963767871, + "learning_rate": 2.319e-06, + "loss": 0.0158, + "num_tokens": 904725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 56.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10125128924846649, + "kl": 0.019602932035923004, + "learning_rate": 2.3186666666666665e-06, + "loss": 0.001, + "num_tokens": 905027.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 56.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26650863885879517, + "kl": 0.05305817723274231, + "learning_rate": 2.3183333333333333e-06, + "loss": 0.0027, + "num_tokens": 905323.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 56.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028388069942593575, + "kl": 0.0015472486848011613, + "learning_rate": 2.318e-06, + "loss": 0.0001, + "num_tokens": 905577.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 56.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050574012100696564, + "kl": 0.010240362957119942, + "learning_rate": 2.317666666666667e-06, + "loss": 0.0005, + "num_tokens": 905883.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 56.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1809522956609726, + "kl": 0.01540428027510643, + "learning_rate": 2.3173333333333336e-06, + "loss": 0.0008, + "num_tokens": 906143.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 56.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3351411819458008, + "kl": 0.04343859851360321, + "learning_rate": 2.317e-06, + "loss": 0.0027, + "num_tokens": 906414.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 56.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26799216866493225, + "kl": 0.04025677964091301, + "learning_rate": 2.316666666666667e-06, + "loss": 0.002, + "num_tokens": 906732.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 56.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.867607593536377, + "kl": 0.1391044482588768, + "learning_rate": 2.316333333333333e-06, + "loss": -0.0006, + "num_tokens": 907135.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 56.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.280645847320557, + "kl": 0.009545482462272048, + "learning_rate": 2.316e-06, + "loss": 0.0354, + "num_tokens": 907462.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 56.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037280612741596997, + "kl": 4.4927000999450684e-05, + "learning_rate": 2.3156666666666667e-06, + "loss": 0.0, + "num_tokens": 907682.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 56.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006934404838830233, + "kl": 0.0004408538370626047, + "learning_rate": 2.3153333333333335e-06, + "loss": 0.0, + "num_tokens": 907902.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 56.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0731884092092514, + "kl": 0.005249355337582529, + "learning_rate": 2.315e-06, + "loss": 0.0003, + "num_tokens": 908172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 56.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004869671072810888, + "kl": 0.0001500844955444336, + "learning_rate": 2.3146666666666666e-06, + "loss": 0.0, + "num_tokens": 908384.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 56.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29635345935821533, + "kl": 0.0346414668019861, + "learning_rate": 2.3143333333333334e-06, + "loss": 0.0017, + "num_tokens": 908668.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 56.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1607277393341064, + "kl": 0.0721521582454443, + "learning_rate": 2.314e-06, + "loss": -0.0737, + "num_tokens": 909023.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 56.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07834595441818237, + "kl": 0.005899720126762986, + "learning_rate": 2.313666666666667e-06, + "loss": 0.0003, + "num_tokens": 909307.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 56.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37481996417045593, + "kl": 0.04803141113370657, + "learning_rate": 2.3133333333333333e-06, + "loss": 0.0029, + "num_tokens": 909589.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 56.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06571433693170547, + "kl": 0.003023725701496005, + "learning_rate": 2.313e-06, + "loss": 0.0002, + "num_tokens": 909845.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 56.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027004772797226906, + "kl": 0.0009928946965374053, + "learning_rate": 2.3126666666666665e-06, + "loss": 0.0, + "num_tokens": 910088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 56.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12884853780269623, + "kl": 0.0746908187866211, + "learning_rate": 2.3123333333333333e-06, + "loss": 0.0037, + "num_tokens": 910452.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024170802906155586, + "kl": 0.0031657739309594035, + "learning_rate": 2.312e-06, + "loss": 0.0002, + "num_tokens": 910750.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 56.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06535699218511581, + "kl": 0.00893158046528697, + "learning_rate": 2.311666666666667e-06, + "loss": 0.0004, + "num_tokens": 911039.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 56.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004719328135251999, + "kl": 0.0029603242874145508, + "learning_rate": 2.3113333333333336e-06, + "loss": 0.0001, + "num_tokens": 911255.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 56.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5798566341400146, + "kl": 0.07439298555254936, + "learning_rate": 2.311e-06, + "loss": 0.0018, + "num_tokens": 911611.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 56.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06750567257404327, + "kl": 0.05538497120141983, + "learning_rate": 2.3106666666666668e-06, + "loss": 0.0028, + "num_tokens": 911944.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016536394134163857, + "kl": 0.001628221827559173, + "learning_rate": 2.310333333333333e-06, + "loss": 0.0001, + "num_tokens": 912221.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 56.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05181695520877838, + "kl": 0.0023673070245422423, + "learning_rate": 2.31e-06, + "loss": 0.0001, + "num_tokens": 912532.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 56.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08608939498662949, + "kl": 0.15065090358257294, + "learning_rate": 2.3096666666666667e-06, + "loss": 0.0075, + "num_tokens": 912849.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 56.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07637684792280197, + "kl": 0.015827403403818607, + "learning_rate": 2.3093333333333335e-06, + "loss": 0.0008, + "num_tokens": 913177.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 56.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09212353825569153, + "kl": 0.0043697357177734375, + "learning_rate": 2.3090000000000003e-06, + "loss": 0.0002, + "num_tokens": 913389.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 56.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018798593431711197, + "kl": 0.0020437620114535093, + "learning_rate": 2.3086666666666666e-06, + "loss": 0.0001, + "num_tokens": 913657.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 56.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.522083282470703, + "kl": 0.027398478239774704, + "learning_rate": 2.3083333333333334e-06, + "loss": 0.3735, + "num_tokens": 914000.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 56.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.773656368255615, + "kl": 0.14111773297190666, + "learning_rate": 2.308e-06, + "loss": -0.0063, + "num_tokens": 914315.0, + "reward": 2.75, + "reward_std": 3.9686269760131836, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 3.9686269760131836, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 57.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09969069063663483, + "kl": 0.008085519599262625, + "learning_rate": 2.307666666666667e-06, + "loss": 0.0004, + "num_tokens": 914660.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 57.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.306377649307251, + "kl": 0.12525998149067163, + "learning_rate": 2.3073333333333333e-06, + "loss": 0.0067, + "num_tokens": 914934.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 57.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13874602317810059, + "kl": 0.03274434059858322, + "learning_rate": 2.307e-06, + "loss": 0.0016, + "num_tokens": 915228.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 57.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06902331858873367, + "kl": 0.010074528399854898, + "learning_rate": 2.3066666666666665e-06, + "loss": 0.0005, + "num_tokens": 915558.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 57.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.246304035186768, + "kl": 0.00846839026780799, + "learning_rate": 2.3063333333333332e-06, + "loss": 0.0727, + "num_tokens": 915897.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 57.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16027773916721344, + "kl": 0.0288890665397048, + "learning_rate": 2.306e-06, + "loss": 0.0014, + "num_tokens": 916169.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 57.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003709029115270823, + "kl": 4.108995199203491e-05, + "learning_rate": 2.305666666666667e-06, + "loss": 0.0, + "num_tokens": 916389.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 57.129629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7916741371154785, + "kl": 0.0329713923856616, + "learning_rate": 2.3053333333333336e-06, + "loss": 0.1511, + "num_tokens": 916725.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 57.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007831676630303264, + "kl": 0.0012073814868927002, + "learning_rate": 2.305e-06, + "loss": 0.0001, + "num_tokens": 917005.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03505430370569229, + "kl": 0.005548796383664012, + "learning_rate": 2.3046666666666667e-06, + "loss": 0.0003, + "num_tokens": 917276.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 57.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05358888581395149, + "kl": 0.0027505953039508313, + "learning_rate": 2.304333333333333e-06, + "loss": 0.0001, + "num_tokens": 917511.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 57.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005964314099401236, + "kl": 0.002862304449081421, + "learning_rate": 2.3040000000000003e-06, + "loss": 0.0001, + "num_tokens": 917727.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0240944791585207, + "kl": 0.00037530362169491127, + "learning_rate": 2.3036666666666667e-06, + "loss": 0.0, + "num_tokens": 917983.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 57.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030668575316667557, + "kl": 0.2634844481945038, + "learning_rate": 2.3033333333333334e-06, + "loss": 0.0132, + "num_tokens": 918287.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03100777044892311, + "kl": 0.001269012689590454, + "learning_rate": 2.3030000000000002e-06, + "loss": 0.0001, + "num_tokens": 918557.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 57.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06761438399553299, + "kl": 0.010994312353432178, + "learning_rate": 2.3026666666666666e-06, + "loss": 0.0005, + "num_tokens": 918861.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 57.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031846552155911922, + "kl": 6.537884473800659e-05, + "learning_rate": 2.3023333333333334e-06, + "loss": 0.0, + "num_tokens": 919073.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 57.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11781570315361023, + "kl": 0.012454411946237087, + "learning_rate": 2.302e-06, + "loss": 0.0007, + "num_tokens": 919339.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 90.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 90.25, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 57.333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5489165782928467, + "kl": 0.05572972074151039, + "learning_rate": 2.301666666666667e-06, + "loss": 0.3391, + "num_tokens": 919916.0, + "reward": 1.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 2.25, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 57.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025521112605929375, + "kl": 0.10341102629899979, + "learning_rate": 2.3013333333333333e-06, + "loss": 0.0053, + "num_tokens": 920284.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 57.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10133406519889832, + "kl": 0.007326021790504456, + "learning_rate": 2.301e-06, + "loss": 0.0004, + "num_tokens": 920528.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 57.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03059845231473446, + "kl": 0.0007258206605911255, + "learning_rate": 2.3006666666666664e-06, + "loss": 0.0, + "num_tokens": 920740.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 57.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060716625303030014, + "kl": 0.0074819540604949, + "learning_rate": 2.3003333333333332e-06, + "loss": 0.0004, + "num_tokens": 921028.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 57.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10100864619016647, + "kl": 0.02005849126726389, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.001, + "num_tokens": 921312.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 57.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10387787967920303, + "kl": 0.0436395313590765, + "learning_rate": 2.2996666666666668e-06, + "loss": 0.0022, + "num_tokens": 921720.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 57.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024397222325205803, + "kl": 0.0027708099223673344, + "learning_rate": 2.2993333333333336e-06, + "loss": 0.0001, + "num_tokens": 922016.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 57.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0444328710436821, + "kl": 0.004600699990987778, + "learning_rate": 2.299e-06, + "loss": 0.0002, + "num_tokens": 922328.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 57.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1559734046459198, + "kl": 0.012478619813919067, + "learning_rate": 2.2986666666666667e-06, + "loss": 0.0006, + "num_tokens": 922588.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 57.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04167880862951279, + "kl": 0.009483292698860168, + "learning_rate": 2.2983333333333335e-06, + "loss": 0.0005, + "num_tokens": 922879.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 57.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.077730417251587, + "kl": 0.05038454011082649, + "learning_rate": 2.2980000000000003e-06, + "loss": 0.0486, + "num_tokens": 923218.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 57.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011280560865998268, + "kl": 0.000858005863847211, + "learning_rate": 2.2976666666666666e-06, + "loss": 0.0, + "num_tokens": 923484.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 57.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028662335127592087, + "kl": 0.005231330171227455, + "learning_rate": 2.2973333333333334e-06, + "loss": 0.0003, + "num_tokens": 923788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 57.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2523195445537567, + "kl": 0.017590429866686463, + "learning_rate": 2.297e-06, + "loss": 0.0008, + "num_tokens": 924056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 57.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027332987636327744, + "kl": 0.0031800430733710527, + "learning_rate": 2.2966666666666666e-06, + "loss": 0.0002, + "num_tokens": 924338.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 57.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12399964779615402, + "kl": 0.03697334788739681, + "learning_rate": 2.2963333333333333e-06, + "loss": 0.0019, + "num_tokens": 924708.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 47.5, + "completions/mean_terminated_length": 47.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 57.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08509934693574905, + "kl": 0.018704267218708992, + "learning_rate": 2.296e-06, + "loss": 0.0009, + "num_tokens": 925118.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 57.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004579869564622641, + "kl": 0.0002828136057360098, + "learning_rate": 2.295666666666667e-06, + "loss": 0.0, + "num_tokens": 925432.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 57.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003519898047670722, + "kl": 0.01617850735783577, + "learning_rate": 2.2953333333333333e-06, + "loss": 0.0008, + "num_tokens": 925692.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026258887723088264, + "kl": 0.0015709161525592208, + "learning_rate": 2.295e-06, + "loss": 0.0001, + "num_tokens": 925952.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.011627906933426857, + "clip_ratio/high_mean": 0.011627906933426857, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011627906933426857, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 57.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.801479816436768, + "kl": 0.02480255998671055, + "learning_rate": 2.2946666666666664e-06, + "loss": -0.0741, + "num_tokens": 926264.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 57.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04094352573156357, + "kl": 0.01576678641140461, + "learning_rate": 2.294333333333333e-06, + "loss": 0.0008, + "num_tokens": 926570.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 57.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0319712795317173, + "kl": 0.0007952234009280801, + "learning_rate": 2.2940000000000004e-06, + "loss": 0.0, + "num_tokens": 926854.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 57.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05532447248697281, + "kl": 0.007991239661350846, + "learning_rate": 2.2936666666666668e-06, + "loss": 0.0004, + "num_tokens": 927176.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 57.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8568344116210938, + "kl": 0.14629393070936203, + "learning_rate": 2.2933333333333335e-06, + "loss": -0.0124, + "num_tokens": 927548.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 57.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04195952042937279, + "kl": 0.0013035978190600872, + "learning_rate": 2.293e-06, + "loss": 0.0001, + "num_tokens": 927770.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07107017189264297, + "kl": 0.006461275741457939, + "learning_rate": 2.2926666666666667e-06, + "loss": 0.0003, + "num_tokens": 928042.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 57.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025414487347006798, + "kl": 0.004055418074131012, + "learning_rate": 2.2923333333333335e-06, + "loss": 0.0002, + "num_tokens": 928252.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 57.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060030724853277206, + "kl": 0.004811800085008144, + "learning_rate": 2.2920000000000002e-06, + "loss": 0.0002, + "num_tokens": 928571.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 57.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39573654532432556, + "kl": 0.023475729685742408, + "learning_rate": 2.2916666666666666e-06, + "loss": 0.0014, + "num_tokens": 928836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 57.907407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.92252779006958, + "kl": 0.037057699635624886, + "learning_rate": 2.2913333333333334e-06, + "loss": -0.0153, + "num_tokens": 929207.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 57.925925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.137658596038818, + "kl": 0.04328920692205429, + "learning_rate": 2.291e-06, + "loss": -0.1235, + "num_tokens": 929529.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 57.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050064437091350555, + "kl": 0.0074186623096466064, + "learning_rate": 2.2906666666666665e-06, + "loss": 0.0004, + "num_tokens": 929797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 57.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9779375791549683, + "kl": 0.014265389880165458, + "learning_rate": 2.2903333333333333e-06, + "loss": -0.0853, + "num_tokens": 930140.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 57.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06911390274763107, + "kl": 0.15921716392040253, + "learning_rate": 2.29e-06, + "loss": 0.008, + "num_tokens": 930450.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021713341120630503, + "kl": 0.0034520328044891357, + "learning_rate": 2.289666666666667e-06, + "loss": 0.0002, + "num_tokens": 930686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 58.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2619903087615967, + "kl": 0.13335754722356796, + "learning_rate": 2.2893333333333332e-06, + "loss": -0.0579, + "num_tokens": 930983.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 58.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18223130702972412, + "kl": 0.03682851418852806, + "learning_rate": 2.289e-06, + "loss": 0.0019, + "num_tokens": 931297.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 58.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16124214231967926, + "kl": 0.02562273107469082, + "learning_rate": 2.2886666666666664e-06, + "loss": 0.0014, + "num_tokens": 931591.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 58.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048398155719041824, + "kl": 0.0190586696844548, + "learning_rate": 2.2883333333333336e-06, + "loss": 0.001, + "num_tokens": 931869.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 58.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03625788167119026, + "kl": 0.0075961456168442965, + "learning_rate": 2.2880000000000004e-06, + "loss": 0.0004, + "num_tokens": 932156.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 58.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03786690533161163, + "kl": 0.10377553105354309, + "learning_rate": 2.2876666666666667e-06, + "loss": 0.0053, + "num_tokens": 932524.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 58.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03660387173295021, + "kl": 0.011968305916525424, + "learning_rate": 2.2873333333333335e-06, + "loss": 0.0006, + "num_tokens": 932810.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 58.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0543820858001709, + "kl": 0.002412225818261504, + "learning_rate": 2.287e-06, + "loss": 0.0001, + "num_tokens": 933130.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 58.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023981111124157906, + "kl": 0.0027115034172311425, + "learning_rate": 2.2866666666666667e-06, + "loss": 0.0001, + "num_tokens": 933426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03546467795968056, + "kl": 0.00028955191373825073, + "learning_rate": 2.2863333333333334e-06, + "loss": 0.0, + "num_tokens": 933638.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 58.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003263151040300727, + "kl": 0.00019640723621705547, + "learning_rate": 2.2860000000000002e-06, + "loss": 0.0, + "num_tokens": 933950.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375699609518051, + "kl": 0.010075107216835022, + "learning_rate": 2.2856666666666666e-06, + "loss": 0.0005, + "num_tokens": 934166.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 58.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10350469499826431, + "kl": 0.03227981645613909, + "learning_rate": 2.2853333333333334e-06, + "loss": 0.0016, + "num_tokens": 934442.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 58.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021121809259057045, + "kl": 0.007212073542177677, + "learning_rate": 2.285e-06, + "loss": 0.0004, + "num_tokens": 934748.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 58.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517319589853287, + "kl": 0.011987493140622973, + "learning_rate": 2.2846666666666665e-06, + "loss": 0.0006, + "num_tokens": 935083.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 58.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12442062050104141, + "kl": 0.018907058984041214, + "learning_rate": 2.2843333333333333e-06, + "loss": 0.001, + "num_tokens": 935371.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 58.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6243205070495605, + "kl": 0.04667986184358597, + "learning_rate": 2.284e-06, + "loss": 0.0016, + "num_tokens": 935711.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 58.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15774016082286835, + "kl": 0.16984020173549652, + "learning_rate": 2.283666666666667e-06, + "loss": 0.0085, + "num_tokens": 936021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 58.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032631825655698776, + "kl": 0.0012128648231737316, + "learning_rate": 2.2833333333333332e-06, + "loss": 0.0001, + "num_tokens": 936270.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 58.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.017707347869873, + "kl": 0.07463878626003861, + "learning_rate": 2.283e-06, + "loss": -0.1594, + "num_tokens": 936542.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 58.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.816429615020752, + "kl": 0.030274469638243318, + "learning_rate": 2.2826666666666668e-06, + "loss": 0.0935, + "num_tokens": 936825.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 58.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10214201360940933, + "kl": 0.0058991871774196625, + "learning_rate": 2.2823333333333336e-06, + "loss": 0.0003, + "num_tokens": 937085.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 58.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006219821982085705, + "kl": 0.0007234037038870156, + "learning_rate": 2.2820000000000003e-06, + "loss": 0.0, + "num_tokens": 937345.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03278283029794693, + "kl": 0.003994007944129407, + "learning_rate": 2.2816666666666667e-06, + "loss": 0.0002, + "num_tokens": 937559.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 58.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007297915406525135, + "kl": 0.00059682727442123, + "learning_rate": 2.2813333333333335e-06, + "loss": 0.0, + "num_tokens": 937779.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 58.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10067549347877502, + "kl": 0.008874677121639252, + "learning_rate": 2.281e-06, + "loss": 0.0005, + "num_tokens": 938109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 58.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5791587829589844, + "kl": 0.060645993798971176, + "learning_rate": 2.2806666666666666e-06, + "loss": 0.1261, + "num_tokens": 938482.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 58.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38537588715553284, + "kl": 0.04332096315920353, + "learning_rate": 2.2803333333333334e-06, + "loss": 0.0021, + "num_tokens": 938813.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 85.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 85.75, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 58.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.909480094909668, + "kl": 0.015507086180150509, + "learning_rate": 2.28e-06, + "loss": 0.228, + "num_tokens": 939408.0, + "reward": 3.924999952316284, + "reward_std": 4.660739898681641, + "rewards/reward_combined/mean": 3.924999952316284, + "rewards/reward_combined/std": 4.660740375518799, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 58.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04581165686249733, + "kl": 0.013135399203747511, + "learning_rate": 2.2796666666666666e-06, + "loss": 0.0007, + "num_tokens": 939714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 58.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06691695749759674, + "kl": 0.010791434440761805, + "learning_rate": 2.2793333333333333e-06, + "loss": 0.0006, + "num_tokens": 940040.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 58.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.299450397491455, + "kl": 0.23470115661621094, + "learning_rate": 2.279e-06, + "loss": -0.0392, + "num_tokens": 940374.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 58.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002639294834807515, + "kl": 0.0015660664066672325, + "learning_rate": 2.2786666666666665e-06, + "loss": 0.0001, + "num_tokens": 940686.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 58.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0319683738052845, + "kl": 0.0013311710208654404, + "learning_rate": 2.2783333333333337e-06, + "loss": 0.0001, + "num_tokens": 940953.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.006410256493836641, + "clip_ratio/low_min": 0.006410256493836641, + "clip_ratio/region_mean": 0.006410256493836641, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 58.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21768856048584, + "kl": 0.06081162393093109, + "learning_rate": 2.278e-06, + "loss": 0.0018, + "num_tokens": 941322.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 58.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00885734986513853, + "kl": 0.000110626220703125, + "learning_rate": 2.277666666666667e-06, + "loss": 0.0, + "num_tokens": 941534.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 58.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05073962360620499, + "kl": 0.0059238689718768, + "learning_rate": 2.277333333333333e-06, + "loss": 0.0003, + "num_tokens": 941823.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 58.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2143610715866089, + "kl": 0.07582800090312958, + "learning_rate": 2.277e-06, + "loss": 0.0038, + "num_tokens": 942220.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 58.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7463741302490234, + "kl": 0.038494925014674664, + "learning_rate": 2.2766666666666668e-06, + "loss": 0.1325, + "num_tokens": 942489.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 58.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04515834525227547, + "kl": 0.0058713669423013926, + "learning_rate": 2.2763333333333335e-06, + "loss": 0.0003, + "num_tokens": 942771.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 47.0, + "completions/mean_terminated_length": 47.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 58.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.432664155960083, + "kl": 0.03460339084267616, + "learning_rate": 2.2760000000000003e-06, + "loss": -0.0464, + "num_tokens": 943179.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002696047071367502, + "kl": 0.0033345669507980347, + "learning_rate": 2.2756666666666667e-06, + "loss": 0.0002, + "num_tokens": 943415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 58.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0314471498131752, + "kl": 0.26328007876873016, + "learning_rate": 2.2753333333333335e-06, + "loss": 0.0132, + "num_tokens": 943719.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 58.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.275594711303711, + "kl": 0.15882434137165546, + "learning_rate": 2.275e-06, + "loss": 0.0333, + "num_tokens": 944035.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 58.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00265194708481431, + "kl": 0.01631934382021427, + "learning_rate": 2.2746666666666666e-06, + "loss": 0.0008, + "num_tokens": 944295.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 58.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01292788702994585, + "kl": 0.00037716925726272166, + "learning_rate": 2.2743333333333334e-06, + "loss": 0.0, + "num_tokens": 944551.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 58.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009411151404492557, + "kl": 0.0012224827660247684, + "learning_rate": 2.274e-06, + "loss": 0.0001, + "num_tokens": 944831.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 58.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003616356698330492, + "kl": 3.196299076080322e-05, + "learning_rate": 2.2736666666666665e-06, + "loss": 0.0, + "num_tokens": 945051.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 58.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010120888240635395, + "kl": 0.000500300811836496, + "learning_rate": 2.2733333333333333e-06, + "loss": 0.0, + "num_tokens": 945286.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 58.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015457162633538246, + "kl": 0.0005570801731664687, + "learning_rate": 2.273e-06, + "loss": 0.0, + "num_tokens": 945548.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 58.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07276313751935959, + "kl": 0.01879236288368702, + "learning_rate": 2.272666666666667e-06, + "loss": 0.001, + "num_tokens": 945876.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 58.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.651898384094238, + "kl": 0.020251495763659477, + "learning_rate": 2.2723333333333337e-06, + "loss": 0.1915, + "num_tokens": 946146.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 58.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03362635523080826, + "kl": 0.0059560188092291355, + "learning_rate": 2.272e-06, + "loss": 0.0003, + "num_tokens": 946419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09272552281618118, + "kl": 0.012921370565891266, + "learning_rate": 2.271666666666667e-06, + "loss": 0.0006, + "num_tokens": 946689.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 59.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1340102106332779, + "kl": 0.027555877342820168, + "learning_rate": 2.271333333333333e-06, + "loss": 0.0014, + "num_tokens": 946964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 59.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07986051589250565, + "kl": 0.005420433357357979, + "learning_rate": 2.271e-06, + "loss": 0.0003, + "num_tokens": 947262.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 59.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027396192774176598, + "kl": 0.00856949482113123, + "learning_rate": 2.2706666666666667e-06, + "loss": 0.0004, + "num_tokens": 947581.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 59.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002769162179902196, + "kl": 0.01630319282412529, + "learning_rate": 2.2703333333333335e-06, + "loss": 0.0008, + "num_tokens": 947841.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 59.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.268100738525391, + "kl": 0.008197366842068732, + "learning_rate": 2.2700000000000003e-06, + "loss": 0.0559, + "num_tokens": 948170.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 59.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8832625150680542, + "kl": 0.30652645975351334, + "learning_rate": 2.2696666666666666e-06, + "loss": -0.0451, + "num_tokens": 948535.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 59.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012851247563958168, + "kl": 0.004570577992126346, + "learning_rate": 2.2693333333333334e-06, + "loss": 0.0002, + "num_tokens": 948825.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 59.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040948763489723206, + "kl": 0.10290932282805443, + "learning_rate": 2.269e-06, + "loss": 0.0051, + "num_tokens": 949193.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006165982689708471, + "kl": 0.0004742443561553955, + "learning_rate": 2.2686666666666666e-06, + "loss": 0.0, + "num_tokens": 949453.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 59.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016570178791880608, + "kl": 0.0006027974013704807, + "learning_rate": 2.2683333333333334e-06, + "loss": 0.0, + "num_tokens": 949687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 59.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13422216475009918, + "kl": 0.010207831393927336, + "learning_rate": 2.268e-06, + "loss": 0.0005, + "num_tokens": 950013.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 59.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17172186076641083, + "kl": 0.03238068986684084, + "learning_rate": 2.267666666666667e-06, + "loss": 0.0017, + "num_tokens": 950302.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 59.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040401242673397064, + "kl": 0.0015272833406925201, + "learning_rate": 2.2673333333333333e-06, + "loss": 0.0001, + "num_tokens": 950562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 59.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007999102585017681, + "kl": 0.002675756812095642, + "learning_rate": 2.267e-06, + "loss": 0.0001, + "num_tokens": 950778.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 59.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01074859406799078, + "kl": 0.0001471191644668579, + "learning_rate": 2.266666666666667e-06, + "loss": 0.0, + "num_tokens": 950990.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 59.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44013193249702454, + "kl": 0.04921358870342374, + "learning_rate": 2.2663333333333336e-06, + "loss": 0.0026, + "num_tokens": 951280.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 59.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07076862454414368, + "kl": 0.034902628511190414, + "learning_rate": 2.266e-06, + "loss": 0.0017, + "num_tokens": 951584.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 59.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0355362631380558, + "kl": 0.0015573574928566813, + "learning_rate": 2.2656666666666668e-06, + "loss": 0.0001, + "num_tokens": 951808.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 59.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03428663685917854, + "kl": 0.15743301808834076, + "learning_rate": 2.265333333333333e-06, + "loss": 0.0079, + "num_tokens": 952117.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 59.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033244941383600235, + "kl": 0.00034081190824508667, + "learning_rate": 2.265e-06, + "loss": 0.0, + "num_tokens": 952329.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 59.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02193329483270645, + "kl": 0.0031790193170309067, + "learning_rate": 2.2646666666666667e-06, + "loss": 0.0002, + "num_tokens": 952641.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 59.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003524729108903557, + "kl": 2.9958784580230713e-05, + "learning_rate": 2.2643333333333335e-06, + "loss": 0.0, + "num_tokens": 952861.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 59.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1891849040985107, + "kl": 0.04719813913106918, + "learning_rate": 2.2640000000000003e-06, + "loss": 0.0263, + "num_tokens": 953190.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 59.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04561041295528412, + "kl": 0.006037738639861345, + "learning_rate": 2.2636666666666666e-06, + "loss": 0.0003, + "num_tokens": 953472.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 59.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06946126371622086, + "kl": 0.02700081653892994, + "learning_rate": 2.2633333333333334e-06, + "loss": 0.0014, + "num_tokens": 953831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 59.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053101733326911926, + "kl": 0.002790637663565576, + "learning_rate": 2.2629999999999998e-06, + "loss": 0.0001, + "num_tokens": 954100.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 59.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026106122881174088, + "kl": 0.051188673824071884, + "learning_rate": 2.262666666666667e-06, + "loss": 0.0026, + "num_tokens": 954433.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 59.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038653019815683365, + "kl": 0.2619563937187195, + "learning_rate": 2.2623333333333333e-06, + "loss": 0.0131, + "num_tokens": 954737.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 59.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.027862071990967, + "kl": 0.07980869337916374, + "learning_rate": 2.262e-06, + "loss": -0.0413, + "num_tokens": 955059.0, + "reward": 3.875, + "reward_std": 2.688710927963257, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 2.688710927963257, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 59.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058128684759140015, + "kl": 0.01026546536013484, + "learning_rate": 2.261666666666667e-06, + "loss": 0.0005, + "num_tokens": 955383.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0051583899185061455, + "kl": 0.00016373396283597685, + "learning_rate": 2.2613333333333333e-06, + "loss": 0.0, + "num_tokens": 955639.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 59.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08959463238716125, + "kl": 0.023373776115477085, + "learning_rate": 2.261e-06, + "loss": 0.0012, + "num_tokens": 955979.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 59.611111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.224197864532471, + "kl": 0.022706760093569756, + "learning_rate": 2.260666666666667e-06, + "loss": 0.0558, + "num_tokens": 956288.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 59.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.607232093811035, + "kl": 0.19994370639324188, + "learning_rate": 2.2603333333333336e-06, + "loss": -0.0326, + "num_tokens": 956687.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 59.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01402705255895853, + "kl": 0.00022174417972564697, + "learning_rate": 2.26e-06, + "loss": 0.0, + "num_tokens": 956891.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 59.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009804409928619862, + "kl": 0.0011819813516922295, + "learning_rate": 2.2596666666666667e-06, + "loss": 0.0001, + "num_tokens": 957171.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 59.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09625894576311111, + "kl": 0.014301734045147896, + "learning_rate": 2.259333333333333e-06, + "loss": 0.0008, + "num_tokens": 957455.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 59.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031403396278619766, + "kl": 0.0014009957667440176, + "learning_rate": 2.259e-06, + "loss": 0.0001, + "num_tokens": 957760.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1456184685230255, + "kl": 0.011133139487355947, + "learning_rate": 2.2586666666666667e-06, + "loss": 0.0005, + "num_tokens": 958034.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 59.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14911921322345734, + "kl": 0.06302820518612862, + "learning_rate": 2.2583333333333335e-06, + "loss": 0.0032, + "num_tokens": 958379.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.009615384973585606, + "clip_ratio/high_mean": 0.009615384973585606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009615384973585606, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 59.75925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.399496555328369, + "kl": 0.08771447464823723, + "learning_rate": 2.2580000000000002e-06, + "loss": 0.0873, + "num_tokens": 958701.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 59.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04860496520996094, + "kl": 0.008628547424450517, + "learning_rate": 2.2576666666666666e-06, + "loss": 0.0004, + "num_tokens": 959007.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.529590368270874, + "kl": 0.09773006429895759, + "learning_rate": 2.2573333333333334e-06, + "loss": 0.0008, + "num_tokens": 959267.0, + "reward": 2.0, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 59.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03229386731982231, + "kl": 0.007061955519020557, + "learning_rate": 2.257e-06, + "loss": 0.0004, + "num_tokens": 959572.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 59.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0624580979347229, + "kl": 0.012530050007626414, + "learning_rate": 2.256666666666667e-06, + "loss": 0.0007, + "num_tokens": 959858.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 59.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0475514717400074, + "kl": 0.0035444003297016025, + "learning_rate": 2.2563333333333333e-06, + "loss": 0.0002, + "num_tokens": 960124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 59.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033293817192316055, + "kl": 0.007924523204565048, + "learning_rate": 2.256e-06, + "loss": 0.0004, + "num_tokens": 960398.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 59.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033901579678058624, + "kl": 0.0005332790315151215, + "learning_rate": 2.255666666666667e-06, + "loss": 0.0, + "num_tokens": 960642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 59.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002517654560506344, + "kl": 0.0033646076917648315, + "learning_rate": 2.2553333333333332e-06, + "loss": 0.0002, + "num_tokens": 960878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21926459670066833, + "kl": 0.015592428855597973, + "learning_rate": 2.255e-06, + "loss": 0.0008, + "num_tokens": 961146.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 59.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7518670558929443, + "kl": 0.022783292457461357, + "learning_rate": 2.254666666666667e-06, + "loss": 0.141, + "num_tokens": 961431.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 59.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02968648634850979, + "kl": 0.004818763351067901, + "learning_rate": 2.2543333333333336e-06, + "loss": 0.0002, + "num_tokens": 961706.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 59.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026802120730280876, + "kl": 0.0010022903443314135, + "learning_rate": 2.254e-06, + "loss": 0.0, + "num_tokens": 961976.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 60.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027512365952134132, + "kl": 0.0018479927675798535, + "learning_rate": 2.2536666666666667e-06, + "loss": 0.0001, + "num_tokens": 962301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 60.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.512545108795166, + "kl": 0.0603331383317709, + "learning_rate": 2.253333333333333e-06, + "loss": 0.0792, + "num_tokens": 962632.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 60.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007467227056622505, + "kl": 0.0010834246641024947, + "learning_rate": 2.253e-06, + "loss": 0.0001, + "num_tokens": 962892.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 60.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023858018219470978, + "kl": 0.0012014210224151611, + "learning_rate": 2.252666666666667e-06, + "loss": 0.0001, + "num_tokens": 963104.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.007352941203862429, + "clip_ratio/high_mean": 0.007352941203862429, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007352941203862429, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 60.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4764111042022705, + "kl": 0.035196663811802864, + "learning_rate": 2.2523333333333334e-06, + "loss": 0.0079, + "num_tokens": 963515.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.092592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.319875717163086, + "kl": 0.04025369882583618, + "learning_rate": 2.252e-06, + "loss": 0.3456, + "num_tokens": 963825.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 60.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474195867776871, + "kl": 0.005463895387947559, + "learning_rate": 2.2516666666666666e-06, + "loss": 0.0003, + "num_tokens": 964101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 60.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03988216072320938, + "kl": 0.2616236209869385, + "learning_rate": 2.2513333333333333e-06, + "loss": 0.0131, + "num_tokens": 964405.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 60.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07153337448835373, + "kl": 0.019581732340157032, + "learning_rate": 2.251e-06, + "loss": 0.001, + "num_tokens": 964754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 60.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002549492521211505, + "kl": 0.003363586962223053, + "learning_rate": 2.250666666666667e-06, + "loss": 0.0002, + "num_tokens": 964990.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 60.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009882776066660881, + "kl": 0.00037163496017456055, + "learning_rate": 2.2503333333333333e-06, + "loss": 0.0, + "num_tokens": 965234.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1212263107299805, + "kl": 0.060780106112360954, + "learning_rate": 2.25e-06, + "loss": 0.0243, + "num_tokens": 965525.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05679375305771828, + "kl": 0.017526951618492603, + "learning_rate": 2.249666666666667e-06, + "loss": 0.0007, + "num_tokens": 965907.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 60.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04464339464902878, + "kl": 0.002002660185098648, + "learning_rate": 2.249333333333333e-06, + "loss": 0.0001, + "num_tokens": 966167.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 60.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056290287524461746, + "kl": 0.0019052649586228654, + "learning_rate": 2.249e-06, + "loss": 0.0001, + "num_tokens": 966389.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 60.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17134232819080353, + "kl": 0.02606994565576315, + "learning_rate": 2.2486666666666668e-06, + "loss": 0.0013, + "num_tokens": 966659.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 60.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3570164442062378, + "kl": 0.1328824907541275, + "learning_rate": 2.2483333333333335e-06, + "loss": 0.0059, + "num_tokens": 966889.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 60.31481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.425131320953369, + "kl": 0.02823589649051428, + "learning_rate": 2.248e-06, + "loss": 0.0956, + "num_tokens": 967183.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 60.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013110277242958546, + "kl": 0.004298093728721142, + "learning_rate": 2.2476666666666667e-06, + "loss": 0.0002, + "num_tokens": 967456.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 60.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011147416196763515, + "kl": 0.0010892586433328688, + "learning_rate": 2.247333333333333e-06, + "loss": 0.0001, + "num_tokens": 967724.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 60.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03502470627427101, + "kl": 0.00041371583938598633, + "learning_rate": 2.2470000000000003e-06, + "loss": 0.0, + "num_tokens": 967936.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 60.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4776037931442261, + "kl": 0.05135340057313442, + "learning_rate": 2.246666666666667e-06, + "loss": 0.0029, + "num_tokens": 968214.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04923976585268974, + "kl": 0.15730884671211243, + "learning_rate": 2.2463333333333334e-06, + "loss": 0.0079, + "num_tokens": 968523.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 60.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1604881286621094, + "kl": 0.06634473241865635, + "learning_rate": 2.246e-06, + "loss": 0.0439, + "num_tokens": 968880.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09133684635162354, + "kl": 0.011883596307598054, + "learning_rate": 2.2456666666666665e-06, + "loss": 0.0006, + "num_tokens": 969151.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 60.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002457198454067111, + "kl": 0.01634630560874939, + "learning_rate": 2.2453333333333333e-06, + "loss": 0.0008, + "num_tokens": 969411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.009433962404727936, + "clip_ratio/high_mean": 0.009433962404727936, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009433962404727936, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 60.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2356605529785156, + "kl": 0.16095459461212158, + "learning_rate": 2.245e-06, + "loss": -0.0205, + "num_tokens": 969768.0, + "reward": 6.625, + "reward_std": 2.428133726119995, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.428133726119995, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 60.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3584771752357483, + "kl": 0.061701999977231026, + "learning_rate": 2.244666666666667e-06, + "loss": 0.0039, + "num_tokens": 970052.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07596871256828308, + "kl": 0.012241472955793142, + "learning_rate": 2.2443333333333332e-06, + "loss": 0.0006, + "num_tokens": 970336.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 60.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10051976144313812, + "kl": 0.03433547355234623, + "learning_rate": 2.244e-06, + "loss": 0.0017, + "num_tokens": 970673.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 60.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003291579778306186, + "kl": 6.13480806350708e-05, + "learning_rate": 2.243666666666667e-06, + "loss": 0.0, + "num_tokens": 970893.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 60.574074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18302927911281586, + "kl": 0.030282115563750267, + "learning_rate": 2.243333333333333e-06, + "loss": 0.0014, + "num_tokens": 971187.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 60.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12607116997241974, + "kl": 0.011203007772564888, + "learning_rate": 2.243e-06, + "loss": 0.0006, + "num_tokens": 971515.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041766345500946045, + "kl": 0.005099525908008218, + "learning_rate": 2.2426666666666667e-06, + "loss": 0.0003, + "num_tokens": 971799.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 60.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008860925445333123, + "kl": 0.0012018127599731088, + "learning_rate": 2.2423333333333335e-06, + "loss": 0.0001, + "num_tokens": 972079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 60.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034295372664928436, + "kl": 0.05643720179796219, + "learning_rate": 2.242e-06, + "loss": 0.0028, + "num_tokens": 972412.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01388558465987444, + "kl": 0.00814725086092949, + "learning_rate": 2.2416666666666667e-06, + "loss": 0.0004, + "num_tokens": 972738.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 60.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05313456058502197, + "kl": 0.020086459815502167, + "learning_rate": 2.2413333333333334e-06, + "loss": 0.001, + "num_tokens": 973009.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 60.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03039192594587803, + "kl": 0.0003601104181143455, + "learning_rate": 2.2410000000000002e-06, + "loss": 0.0, + "num_tokens": 973265.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1424340009689331, + "kl": 0.030056262388825417, + "learning_rate": 2.240666666666667e-06, + "loss": 0.0014, + "num_tokens": 973567.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 60.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021793438121676445, + "kl": 0.0023846477270126343, + "learning_rate": 2.2403333333333334e-06, + "loss": 0.0001, + "num_tokens": 973775.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 60.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013799660839140415, + "kl": 0.0026278942823410034, + "learning_rate": 2.24e-06, + "loss": 0.0001, + "num_tokens": 974071.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 60.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00799830537289381, + "kl": 0.00044744781916961074, + "learning_rate": 2.2396666666666665e-06, + "loss": 0.0, + "num_tokens": 974307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34845107793807983, + "kl": 0.03930492326617241, + "learning_rate": 2.2393333333333333e-06, + "loss": 0.0019, + "num_tokens": 974648.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 60.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10607807338237762, + "kl": 0.008225849131122231, + "learning_rate": 2.239e-06, + "loss": 0.0004, + "num_tokens": 974908.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 60.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732354000210762, + "kl": 0.013455578591674566, + "learning_rate": 2.238666666666667e-06, + "loss": 0.0007, + "num_tokens": 975238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 60.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08038192987442017, + "kl": 0.010605439194478095, + "learning_rate": 2.2383333333333332e-06, + "loss": 0.0006, + "num_tokens": 975570.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 60.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09279978275299072, + "kl": 0.08303552120923996, + "learning_rate": 2.238e-06, + "loss": 0.0042, + "num_tokens": 975936.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 60.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.126615047454834, + "kl": 0.06999421655200422, + "learning_rate": 2.2376666666666668e-06, + "loss": 0.0933, + "num_tokens": 976235.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 60.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022675970569252968, + "kl": 0.0011477507650852203, + "learning_rate": 2.237333333333333e-06, + "loss": 0.0001, + "num_tokens": 976547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 60.925925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.521695137023926, + "kl": 0.052976700535509735, + "learning_rate": 2.2370000000000004e-06, + "loss": 0.0186, + "num_tokens": 976865.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 60.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36096513271331787, + "kl": 0.030867554945871234, + "learning_rate": 2.2366666666666667e-06, + "loss": 0.0016, + "num_tokens": 977174.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 60.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9423773288726807, + "kl": 0.026548580965027213, + "learning_rate": 2.2363333333333335e-06, + "loss": 0.0134, + "num_tokens": 977480.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 60.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055974461138248444, + "kl": 0.0312656294554472, + "learning_rate": 2.236e-06, + "loss": 0.0016, + "num_tokens": 977778.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 61.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07440923154354095, + "kl": 0.005075130713521503, + "learning_rate": 2.2356666666666666e-06, + "loss": 0.0002, + "num_tokens": 978087.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 61.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14330555498600006, + "kl": 0.019268010277301073, + "learning_rate": 2.2353333333333334e-06, + "loss": 0.001, + "num_tokens": 978419.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 61.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1167730987071991, + "kl": 0.012032601051032543, + "learning_rate": 2.235e-06, + "loss": 0.0006, + "num_tokens": 978695.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 61.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022528642788529396, + "kl": 0.0007245764136314392, + "learning_rate": 2.234666666666667e-06, + "loss": 0.0, + "num_tokens": 978955.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 61.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015033102594316006, + "kl": 0.0042897912207990885, + "learning_rate": 2.2343333333333333e-06, + "loss": 0.0002, + "num_tokens": 979228.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 61.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059593863785266876, + "kl": 0.027693637646734715, + "learning_rate": 2.234e-06, + "loss": 0.0015, + "num_tokens": 979517.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 61.111111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019043680280447006, + "kl": 0.04799039848148823, + "learning_rate": 2.2336666666666665e-06, + "loss": 0.0024, + "num_tokens": 979849.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 61.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016476990655064583, + "kl": 0.00288031913805753, + "learning_rate": 2.2333333333333333e-06, + "loss": 0.0002, + "num_tokens": 980151.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 61.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04146501421928406, + "kl": 0.2614165246486664, + "learning_rate": 2.233e-06, + "loss": 0.0131, + "num_tokens": 980455.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 61.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04730117693543434, + "kl": 0.01096729189157486, + "learning_rate": 2.232666666666667e-06, + "loss": 0.0005, + "num_tokens": 980754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 61.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11177375912666321, + "kl": 0.011237940285354853, + "learning_rate": 2.232333333333333e-06, + "loss": 0.0006, + "num_tokens": 981015.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 61.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004694729577749968, + "kl": 0.0006412813963834196, + "learning_rate": 2.232e-06, + "loss": 0.0, + "num_tokens": 981231.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 61.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08639340102672577, + "kl": 0.004216181579977274, + "learning_rate": 2.2316666666666668e-06, + "loss": 0.0002, + "num_tokens": 981452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 61.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2088747024536133, + "kl": 0.16476541478186846, + "learning_rate": 2.2313333333333335e-06, + "loss": 0.0078, + "num_tokens": 981729.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 61.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.663725852966309, + "kl": 0.025627458177041262, + "learning_rate": 2.2310000000000003e-06, + "loss": 0.0955, + "num_tokens": 982010.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 61.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01842394471168518, + "kl": 0.00127878796774894, + "learning_rate": 2.2306666666666667e-06, + "loss": 0.0001, + "num_tokens": 982333.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 61.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03290437534451485, + "kl": 0.0007133185972634237, + "learning_rate": 2.2303333333333335e-06, + "loss": 0.0, + "num_tokens": 982589.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 61.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09576458483934402, + "kl": 0.015226204879581928, + "learning_rate": 2.23e-06, + "loss": 0.0008, + "num_tokens": 982877.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 61.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2942647933959961, + "kl": 0.05347330495715141, + "learning_rate": 2.2296666666666666e-06, + "loss": 0.0028, + "num_tokens": 983205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 61.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005737815517932177, + "kl": 0.0014646127820014954, + "learning_rate": 2.2293333333333334e-06, + "loss": 0.0001, + "num_tokens": 983421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 61.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020061105489730835, + "kl": 0.0006044578040018678, + "learning_rate": 2.229e-06, + "loss": 0.0, + "num_tokens": 983656.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 61.388888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.012406587600708, + "kl": 0.06878912821412086, + "learning_rate": 2.228666666666667e-06, + "loss": -0.0222, + "num_tokens": 984025.0, + "reward": 4.875, + "reward_std": 3.75, + "rewards/reward_combined/mean": 4.875, + "rewards/reward_combined/std": 3.75, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 61.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0343567430973053, + "kl": 0.006915203528478742, + "learning_rate": 2.2283333333333333e-06, + "loss": 0.0003, + "num_tokens": 984293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 61.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12580174207687378, + "kl": 0.028369064209982753, + "learning_rate": 2.228e-06, + "loss": 0.0013, + "num_tokens": 984615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 61.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018884699791669846, + "kl": 0.000634184674709104, + "learning_rate": 2.2276666666666665e-06, + "loss": 0.0, + "num_tokens": 984928.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 61.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006954753189347684, + "kl": 0.0012185658561065793, + "learning_rate": 2.2273333333333332e-06, + "loss": 0.0001, + "num_tokens": 985208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 61.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003398414410185069, + "kl": 5.6333839893341064e-05, + "learning_rate": 2.227e-06, + "loss": 0.0, + "num_tokens": 985428.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 84.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 84.0, + "completions/mean_terminated_length": 26.666667938232422, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 61.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.566087484359741, + "kl": 0.04127374291419983, + "learning_rate": 2.226666666666667e-06, + "loss": 0.5316, + "num_tokens": 986016.0, + "reward": 7.175000190734863, + "reward_std": 0.39475739002227783, + "rewards/reward_combined/mean": 7.175000190734863, + "rewards/reward_combined/std": 0.39475739002227783, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 61.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018250636756420135, + "kl": 0.002339041791856289, + "learning_rate": 2.2263333333333336e-06, + "loss": 0.0001, + "num_tokens": 986328.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 61.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031394533812999725, + "kl": 0.00028486549854278564, + "learning_rate": 2.226e-06, + "loss": 0.0, + "num_tokens": 986540.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 61.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03656277433037758, + "kl": 0.0016409651725552976, + "learning_rate": 2.2256666666666667e-06, + "loss": 0.0001, + "num_tokens": 986807.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 61.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.334615707397461, + "kl": 0.028304174542427063, + "learning_rate": 2.2253333333333335e-06, + "loss": -0.0167, + "num_tokens": 987105.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 61.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022786613553762436, + "kl": 0.0033953189849853516, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0002, + "num_tokens": 987341.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 61.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02110440470278263, + "kl": 0.0031103537185117602, + "learning_rate": 2.2246666666666667e-06, + "loss": 0.0002, + "num_tokens": 987623.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.012195121496915817, + "clip_ratio/high_mean": 0.012195121496915817, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012195121496915817, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 61.629629629629626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.281612873077393, + "kl": 0.05813291110098362, + "learning_rate": 2.2243333333333334e-06, + "loss": 0.0149, + "num_tokens": 987924.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 61.648148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2800800800323486, + "kl": 0.02919972687959671, + "learning_rate": 2.224e-06, + "loss": 0.0681, + "num_tokens": 988299.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 61.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0041159335523843765, + "kl": 0.016093909740447998, + "learning_rate": 2.2236666666666666e-06, + "loss": 0.0008, + "num_tokens": 988559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 61.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040952134877443314, + "kl": 0.00654768873937428, + "learning_rate": 2.2233333333333334e-06, + "loss": 0.0003, + "num_tokens": 988880.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 61.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4205713272094727, + "kl": 0.027768907137215137, + "learning_rate": 2.223e-06, + "loss": 0.225, + "num_tokens": 989248.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 61.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05844840779900551, + "kl": 0.15637709200382233, + "learning_rate": 2.222666666666667e-06, + "loss": 0.0078, + "num_tokens": 989557.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 61.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12842020392417908, + "kl": 0.009622111800126731, + "learning_rate": 2.2223333333333333e-06, + "loss": 0.0005, + "num_tokens": 989847.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 61.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01615673303604126, + "kl": 0.0009997934103012085, + "learning_rate": 2.222e-06, + "loss": 0.0, + "num_tokens": 990107.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 61.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7181059122085571, + "kl": 0.04054766148328781, + "learning_rate": 2.2216666666666664e-06, + "loss": 0.0018, + "num_tokens": 990371.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 61.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06952530890703201, + "kl": 0.006099112331867218, + "learning_rate": 2.2213333333333336e-06, + "loss": 0.0003, + "num_tokens": 990615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 61.81481481481482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1310569047927856, + "kl": 0.04693709872663021, + "learning_rate": 2.221e-06, + "loss": -0.0128, + "num_tokens": 991028.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 61.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03884014114737511, + "kl": 0.10007043182849884, + "learning_rate": 2.2206666666666668e-06, + "loss": 0.0051, + "num_tokens": 991396.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 61.851851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2800207138061523, + "kl": 0.07188292033970356, + "learning_rate": 2.2203333333333336e-06, + "loss": -0.0054, + "num_tokens": 991732.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 61.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05696495622396469, + "kl": 0.011853685136884451, + "learning_rate": 2.22e-06, + "loss": 0.0006, + "num_tokens": 992022.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 61.888888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061937589198350906, + "kl": 0.005190921947360039, + "learning_rate": 2.2196666666666667e-06, + "loss": 0.0003, + "num_tokens": 992320.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 61.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13253596425056458, + "kl": 0.016126180300489068, + "learning_rate": 2.2193333333333335e-06, + "loss": 0.0009, + "num_tokens": 992664.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 61.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051882702857255936, + "kl": 0.0064075172413140535, + "learning_rate": 2.2190000000000003e-06, + "loss": 0.0003, + "num_tokens": 992984.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 61.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051254939287900925, + "kl": 0.012020382098853588, + "learning_rate": 2.2186666666666666e-06, + "loss": 0.0006, + "num_tokens": 993397.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 61.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07140407711267471, + "kl": 0.0061962848994880915, + "learning_rate": 2.2183333333333334e-06, + "loss": 0.0003, + "num_tokens": 993659.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 61.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.329586982727051, + "kl": 0.03178100101649761, + "learning_rate": 2.2179999999999998e-06, + "loss": 0.3282, + "num_tokens": 993949.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 62.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01830742135643959, + "kl": 0.0025819912552833557, + "learning_rate": 2.2176666666666666e-06, + "loss": 0.0002, + "num_tokens": 994157.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 62.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06955549120903015, + "kl": 0.008563360664993525, + "learning_rate": 2.2173333333333333e-06, + "loss": 0.0004, + "num_tokens": 994431.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 62.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09347642958164215, + "kl": 0.014489857479929924, + "learning_rate": 2.217e-06, + "loss": 0.0007, + "num_tokens": 994767.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 62.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05998816713690758, + "kl": 0.0013189032906666398, + "learning_rate": 2.216666666666667e-06, + "loss": 0.0001, + "num_tokens": 994980.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 62.074074074074076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04640239477157593, + "kl": 0.0019155815825797617, + "learning_rate": 2.2163333333333333e-06, + "loss": 0.0001, + "num_tokens": 995248.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 62.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06852405518293381, + "kl": 0.004565507173538208, + "learning_rate": 2.216e-06, + "loss": 0.0002, + "num_tokens": 995464.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 62.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.153611660003662, + "kl": 0.08314665406942368, + "learning_rate": 2.215666666666667e-06, + "loss": 0.045, + "num_tokens": 995760.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 62.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006426460109651089, + "kl": 0.0003775298537220806, + "learning_rate": 2.2153333333333336e-06, + "loss": 0.0, + "num_tokens": 995980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 62.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12019623816013336, + "kl": 0.041127199307084084, + "learning_rate": 2.215e-06, + "loss": 0.0021, + "num_tokens": 996309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037281282246112823, + "kl": 0.004518923815339804, + "learning_rate": 2.2146666666666668e-06, + "loss": 0.0002, + "num_tokens": 996593.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 62.18518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07549259066581726, + "kl": 0.028739141300320625, + "learning_rate": 2.2143333333333335e-06, + "loss": 0.0014, + "num_tokens": 996930.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 62.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027317391708493233, + "kl": 0.003621056559495628, + "learning_rate": 2.214e-06, + "loss": 0.0002, + "num_tokens": 997190.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06416074931621552, + "kl": 0.004746583057567477, + "learning_rate": 2.2136666666666667e-06, + "loss": 0.0002, + "num_tokens": 997490.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 62.24074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06248186156153679, + "kl": 0.012136355508118868, + "learning_rate": 2.2133333333333335e-06, + "loss": 0.0006, + "num_tokens": 997793.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 62.25925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9482815265655518, + "kl": 0.05561781022697687, + "learning_rate": 2.2130000000000002e-06, + "loss": -0.0597, + "num_tokens": 998120.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 62.27777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.304361343383789, + "kl": 0.010267814621329308, + "learning_rate": 2.2126666666666666e-06, + "loss": 0.2741, + "num_tokens": 998399.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 62.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08255226910114288, + "kl": 0.004989365988876671, + "learning_rate": 2.2123333333333334e-06, + "loss": 0.0003, + "num_tokens": 998721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 62.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05197465792298317, + "kl": 0.012701776344329119, + "learning_rate": 2.2119999999999997e-06, + "loss": 0.0005, + "num_tokens": 999103.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1006372720003128, + "kl": 0.018497135490179062, + "learning_rate": 2.2116666666666665e-06, + "loss": 0.001, + "num_tokens": 999390.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 62.351851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11387915909290314, + "kl": 0.013417403679341078, + "learning_rate": 2.2113333333333337e-06, + "loss": 0.0007, + "num_tokens": 999680.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 62.370370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024978553876280785, + "kl": 0.003375243606569711, + "learning_rate": 2.211e-06, + "loss": 0.0002, + "num_tokens": 999944.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 62.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06752584129571915, + "kl": 0.004735873662866652, + "learning_rate": 2.210666666666667e-06, + "loss": 0.0002, + "num_tokens": 1000242.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 62.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028170356526970863, + "kl": 0.005824573803693056, + "learning_rate": 2.2103333333333332e-06, + "loss": 0.0003, + "num_tokens": 1000510.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.425925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.58154582977295, + "kl": 0.09674317017197609, + "learning_rate": 2.21e-06, + "loss": 0.1458, + "num_tokens": 1000804.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 62.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5221912860870361, + "kl": 0.29760829266160727, + "learning_rate": 2.209666666666667e-06, + "loss": 0.0141, + "num_tokens": 1001063.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 62.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13651998341083527, + "kl": 0.021502234041690826, + "learning_rate": 2.2093333333333336e-06, + "loss": 0.0011, + "num_tokens": 1001345.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 62.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05915094539523125, + "kl": 0.005745707137975842, + "learning_rate": 2.209e-06, + "loss": 0.0003, + "num_tokens": 1001672.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 62.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4944217205047607, + "kl": 0.04194041155278683, + "learning_rate": 2.2086666666666667e-06, + "loss": 0.128, + "num_tokens": 1002037.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 62.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517662949860096, + "kl": 0.14405743777751923, + "learning_rate": 2.2083333333333335e-06, + "loss": 0.0072, + "num_tokens": 1002350.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 62.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030326293781399727, + "kl": 0.03141090925782919, + "learning_rate": 2.208e-06, + "loss": 0.0016, + "num_tokens": 1002755.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 62.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027798175811767578, + "kl": 0.004956083721481264, + "learning_rate": 2.2076666666666666e-06, + "loss": 0.0003, + "num_tokens": 1003078.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 62.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0506179332733154, + "kl": 0.30602581799030304, + "learning_rate": 2.2073333333333334e-06, + "loss": 0.0154, + "num_tokens": 1003382.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 62.592592592592595, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3334875106811523, + "kl": 0.28185568004846573, + "learning_rate": 2.2070000000000002e-06, + "loss": 0.0218, + "num_tokens": 1003715.0, + "reward": 4.75, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 4.75, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 62.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13829700648784637, + "kl": 0.11752831935882568, + "learning_rate": 2.2066666666666666e-06, + "loss": 0.0059, + "num_tokens": 1004083.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 62.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012094398960471153, + "kl": 0.0006010000070091337, + "learning_rate": 2.2063333333333334e-06, + "loss": 0.0, + "num_tokens": 1004318.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 62.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.11672842502594, + "kl": 0.046068258583545685, + "learning_rate": 2.2059999999999997e-06, + "loss": 0.0026, + "num_tokens": 1004528.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 62.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7228609919548035, + "kl": 0.1081484891474247, + "learning_rate": 2.205666666666667e-06, + "loss": 0.0055, + "num_tokens": 1004910.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 62.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007040159311145544, + "kl": 0.00012621581845451146, + "learning_rate": 2.2053333333333337e-06, + "loss": 0.0, + "num_tokens": 1005166.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1098102331161499, + "kl": 0.01248421985656023, + "learning_rate": 2.205e-06, + "loss": 0.0006, + "num_tokens": 1005438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 62.72222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003340615890920162, + "kl": 0.0016763443127274513, + "learning_rate": 2.204666666666667e-06, + "loss": 0.0001, + "num_tokens": 1005750.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 62.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00031305052107200027, + "kl": 6.771087646484375e-05, + "learning_rate": 2.204333333333333e-06, + "loss": 0.0, + "num_tokens": 1005970.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 62.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028128478676080704, + "kl": 0.0009418537665624171, + "learning_rate": 2.204e-06, + "loss": 0.0, + "num_tokens": 1006234.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 62.77777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6902763843536377, + "kl": 0.1394881308078766, + "learning_rate": 2.2036666666666668e-06, + "loss": 0.0069, + "num_tokens": 1006607.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 62.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3452072143554688, + "kl": 0.3022227343171835, + "learning_rate": 2.2033333333333336e-06, + "loss": 0.0169, + "num_tokens": 1006934.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 62.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032515645027160645, + "kl": 0.0014913790510036051, + "learning_rate": 2.203e-06, + "loss": 0.0001, + "num_tokens": 1007201.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.008064515888690948, + "clip_ratio/high_mean": 0.008064515888690948, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008064515888690948, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 62.833333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.496065616607666, + "kl": 0.1216975748538971, + "learning_rate": 2.2026666666666667e-06, + "loss": -0.1729, + "num_tokens": 1007525.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 62.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32536420226097107, + "kl": 0.04868849087506533, + "learning_rate": 2.2023333333333335e-06, + "loss": 0.0031, + "num_tokens": 1007809.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 62.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05128289386630058, + "kl": 0.010341319721192122, + "learning_rate": 2.202e-06, + "loss": 0.0005, + "num_tokens": 1008095.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 62.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.608768463134766, + "kl": 0.05090206302702427, + "learning_rate": 2.2016666666666666e-06, + "loss": 0.0705, + "num_tokens": 1008437.0, + "reward": 2.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 2.25, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 62.907407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04566143453121185, + "kl": 0.002851595403626561, + "learning_rate": 2.2013333333333334e-06, + "loss": 0.0001, + "num_tokens": 1008680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 62.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007186815491877496, + "kl": 0.0012371200136840343, + "learning_rate": 2.201e-06, + "loss": 0.0001, + "num_tokens": 1008960.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 62.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07954184710979462, + "kl": 0.001954585313796997, + "learning_rate": 2.2006666666666665e-06, + "loss": 0.0001, + "num_tokens": 1009176.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 62.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022725940216332674, + "kl": 0.00338079035282135, + "learning_rate": 2.2003333333333333e-06, + "loss": 0.0002, + "num_tokens": 1009412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.012195121496915817, + "clip_ratio/low_min": 0.012195121496915817, + "clip_ratio/region_mean": 0.012195121496915817, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 62.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9990084171295166, + "kl": 0.03532938752323389, + "learning_rate": 2.1999999999999997e-06, + "loss": 0.0991, + "num_tokens": 1009726.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 63.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7755279541015625, + "kl": 0.05305712204426527, + "learning_rate": 2.199666666666667e-06, + "loss": -0.0979, + "num_tokens": 1010011.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 63.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1649562120437622, + "kl": 0.01902078534476459, + "learning_rate": 2.1993333333333337e-06, + "loss": 0.001, + "num_tokens": 1010349.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 63.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024097179993987083, + "kl": 0.003337077796459198, + "learning_rate": 2.199e-06, + "loss": 0.0002, + "num_tokens": 1010585.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 63.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02678484283387661, + "kl": 0.0006867200136184692, + "learning_rate": 2.198666666666667e-06, + "loss": 0.0, + "num_tokens": 1010853.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 63.074074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.996821165084839, + "kl": 0.2182103544473648, + "learning_rate": 2.198333333333333e-06, + "loss": 0.0521, + "num_tokens": 1011154.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 63.092592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014491081237792969, + "kl": 0.00044173747301101685, + "learning_rate": 2.198e-06, + "loss": 0.0, + "num_tokens": 1011414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 63.111111111111114, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6314876079559326, + "kl": 0.040607634000480175, + "learning_rate": 2.1976666666666667e-06, + "loss": -0.03, + "num_tokens": 1011765.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 63.129629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04953719675540924, + "kl": 0.005309153348207474, + "learning_rate": 2.1973333333333335e-06, + "loss": 0.0003, + "num_tokens": 1012094.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 63.148148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003084309573750943, + "kl": 7.05718994140625e-05, + "learning_rate": 2.197e-06, + "loss": 0.0, + "num_tokens": 1012314.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 63.166666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11731244623661041, + "kl": 0.012788111809641123, + "learning_rate": 2.1966666666666667e-06, + "loss": 0.0006, + "num_tokens": 1012609.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 63.18518518518518, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.53308629989624, + "kl": 0.05539463832974434, + "learning_rate": 2.1963333333333335e-06, + "loss": -0.0533, + "num_tokens": 1012938.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 63.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.813312530517578, + "kl": 0.09071268513798714, + "learning_rate": 2.196e-06, + "loss": 0.0086, + "num_tokens": 1013255.0, + "reward": 2.25, + "reward_std": 2.020725965499878, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 2.020725965499878, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 63.22222222222222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053565025329589844, + "kl": 0.023334696888923645, + "learning_rate": 2.195666666666667e-06, + "loss": 0.0012, + "num_tokens": 1013600.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 63.24074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8939841985702515, + "kl": 0.08374106511473656, + "learning_rate": 2.1953333333333334e-06, + "loss": 0.0413, + "num_tokens": 1013937.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 63.25925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026571355760097504, + "kl": 0.0007585063576698303, + "learning_rate": 2.195e-06, + "loss": 0.0, + "num_tokens": 1014181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 63.27777777777778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005973426625132561, + "kl": 0.0002433204062981531, + "learning_rate": 2.1946666666666665e-06, + "loss": 0.0, + "num_tokens": 1014497.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 63.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09025933593511581, + "kl": 0.005301809869706631, + "learning_rate": 2.1943333333333333e-06, + "loss": 0.0003, + "num_tokens": 1014719.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 63.31481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36134329438209534, + "kl": 0.029291070997714996, + "learning_rate": 2.194e-06, + "loss": 0.002, + "num_tokens": 1014927.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 63.333333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17901213467121124, + "kl": 0.03326728194952011, + "learning_rate": 2.193666666666667e-06, + "loss": 0.0017, + "num_tokens": 1015258.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 63.351851851851855, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0757012367248535, + "kl": 0.26542486995458603, + "learning_rate": 2.1933333333333337e-06, + "loss": 0.342, + "num_tokens": 1015588.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 63.370370370370374, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.303065299987793, + "kl": 0.681626558303833, + "learning_rate": 2.193e-06, + "loss": -0.1514, + "num_tokens": 1015843.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 63.388888888888886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006917375605553389, + "kl": 0.2675882875919342, + "learning_rate": 2.192666666666667e-06, + "loss": 0.0134, + "num_tokens": 1016147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 63.407407407407405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15333710610866547, + "kl": 0.004445262253284454, + "learning_rate": 2.192333333333333e-06, + "loss": 0.0002, + "num_tokens": 1016359.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 63.425925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04257291927933693, + "kl": 0.01012834021821618, + "learning_rate": 2.192e-06, + "loss": 0.0005, + "num_tokens": 1016664.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 63.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031284891068935394, + "kl": 0.12643224373459816, + "learning_rate": 2.1916666666666667e-06, + "loss": 0.0064, + "num_tokens": 1017034.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 63.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008705061045475304, + "kl": 0.001197568024508655, + "learning_rate": 2.1913333333333335e-06, + "loss": 0.0001, + "num_tokens": 1017314.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 63.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005729298572987318, + "kl": 0.0014186277985572815, + "learning_rate": 2.191e-06, + "loss": 0.0001, + "num_tokens": 1017530.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 63.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.176250457763672, + "kl": 0.028239657171070576, + "learning_rate": 2.1906666666666666e-06, + "loss": 0.1668, + "num_tokens": 1017815.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 63.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11604803800582886, + "kl": 0.013675297028385103, + "learning_rate": 2.1903333333333334e-06, + "loss": 0.0007, + "num_tokens": 1018141.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 63.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013029688969254494, + "kl": 0.0003342479467391968, + "learning_rate": 2.19e-06, + "loss": 0.0, + "num_tokens": 1018353.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 63.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05150968208909035, + "kl": 0.0017892661562655121, + "learning_rate": 2.189666666666667e-06, + "loss": 0.0001, + "num_tokens": 1018586.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 63.574074074074076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2737945318222046, + "kl": 0.07921257987618446, + "learning_rate": 2.1893333333333334e-06, + "loss": 0.0277, + "num_tokens": 1018993.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 63.592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008199530653655529, + "kl": 0.001806585118174553, + "learning_rate": 2.189e-06, + "loss": 0.0001, + "num_tokens": 1019305.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 63.611111111111114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02698396146297455, + "kl": 0.0057526868768036366, + "learning_rate": 2.1886666666666665e-06, + "loss": 0.0003, + "num_tokens": 1019583.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 63.629629629629626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04575595632195473, + "kl": 0.013479504734277725, + "learning_rate": 2.1883333333333333e-06, + "loss": 0.0006, + "num_tokens": 1019965.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 63.648148148148145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18682636320590973, + "kl": 0.01179274870082736, + "learning_rate": 2.188e-06, + "loss": 0.0006, + "num_tokens": 1020263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 63.666666666666664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009331322275102139, + "kl": 0.014926896896213293, + "learning_rate": 2.187666666666667e-06, + "loss": 0.0007, + "num_tokens": 1020523.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 63.68518518518518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02341627888381481, + "kl": 0.006356117781251669, + "learning_rate": 2.1873333333333336e-06, + "loss": 0.0003, + "num_tokens": 1020811.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 63.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053721267729997635, + "kl": 0.004764894372783601, + "learning_rate": 2.187e-06, + "loss": 0.0003, + "num_tokens": 1021082.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 63.72222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.549738645553589, + "kl": 0.14344819635152817, + "learning_rate": 2.1866666666666668e-06, + "loss": 0.0602, + "num_tokens": 1021432.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 63.74074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01474683452397585, + "kl": 0.0010590986930765212, + "learning_rate": 2.186333333333333e-06, + "loss": 0.0001, + "num_tokens": 1021700.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 63.75925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11663239449262619, + "kl": 0.041027034632861614, + "learning_rate": 2.186e-06, + "loss": 0.0021, + "num_tokens": 1021990.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 63.77777777777778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.997528076171875, + "kl": 0.04900665208697319, + "learning_rate": 2.1856666666666667e-06, + "loss": -0.066, + "num_tokens": 1022328.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 63.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033921096473932266, + "kl": 0.0043184710666537285, + "learning_rate": 2.1853333333333335e-06, + "loss": 0.0002, + "num_tokens": 1022612.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 63.81481481481482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759907931089401, + "kl": 0.002453757624607533, + "learning_rate": 2.1850000000000003e-06, + "loss": 0.0001, + "num_tokens": 1022914.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 63.833333333333336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0442030169069767, + "kl": 0.15268483757972717, + "learning_rate": 2.1846666666666666e-06, + "loss": 0.0076, + "num_tokens": 1023223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 63.851851851851855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059195198118686676, + "kl": 0.009379944764077663, + "learning_rate": 2.1843333333333334e-06, + "loss": 0.0005, + "num_tokens": 1023497.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 63.870370370370374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03964879363775253, + "kl": 0.007741866167634726, + "learning_rate": 2.184e-06, + "loss": 0.0004, + "num_tokens": 1023792.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 63.888888888888886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.095860481262207, + "kl": 0.023096129298210144, + "learning_rate": 2.183666666666667e-06, + "loss": 0.0493, + "num_tokens": 1024148.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 63.907407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3290202617645264, + "kl": 0.04408697225153446, + "learning_rate": 2.1833333333333333e-06, + "loss": 0.0614, + "num_tokens": 1024439.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 63.925925925925924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.263012558221817, + "kl": 0.061270684003829956, + "learning_rate": 2.183e-06, + "loss": 0.0031, + "num_tokens": 1024735.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 63.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034099407494068146, + "kl": 0.0060203049797564745, + "learning_rate": 2.1826666666666665e-06, + "loss": 0.0003, + "num_tokens": 1025003.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 63.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044186048209667206, + "kl": 0.0025317840045318007, + "learning_rate": 2.1823333333333332e-06, + "loss": 0.0001, + "num_tokens": 1025274.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 63.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09788111597299576, + "kl": 0.003438621759414673, + "learning_rate": 2.182e-06, + "loss": 0.0002, + "num_tokens": 1025530.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 64.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007005822379142046, + "kl": 0.0006105720822233707, + "learning_rate": 2.181666666666667e-06, + "loss": 0.0, + "num_tokens": 1025790.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 64.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02793942205607891, + "kl": 0.12566199526190758, + "learning_rate": 2.1813333333333336e-06, + "loss": 0.0063, + "num_tokens": 1026160.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 64.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.262382507324219, + "kl": 1.774953931570053, + "learning_rate": 2.181e-06, + "loss": 0.1041, + "num_tokens": 1026465.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 64.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15645377337932587, + "kl": 0.02274109423160553, + "learning_rate": 2.1806666666666667e-06, + "loss": 0.0012, + "num_tokens": 1026754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 64.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03744257614016533, + "kl": 0.0033129056682810187, + "learning_rate": 2.180333333333333e-06, + "loss": 0.0001, + "num_tokens": 1027016.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 64.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04662017151713371, + "kl": 0.0016747861409385223, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.0001, + "num_tokens": 1027272.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 64.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0083814337849617, + "kl": 0.0016384795308113098, + "learning_rate": 2.1796666666666667e-06, + "loss": 0.0001, + "num_tokens": 1027584.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00520468782633543, + "kl": 0.000131264328956604, + "learning_rate": 2.1793333333333334e-06, + "loss": 0.0, + "num_tokens": 1027796.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 64.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.279834508895874, + "kl": 0.03943028347566724, + "learning_rate": 2.1790000000000002e-06, + "loss": 0.0024, + "num_tokens": 1028078.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 64.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3873960971832275, + "kl": 0.054739379324018955, + "learning_rate": 2.1786666666666666e-06, + "loss": -0.0447, + "num_tokens": 1028405.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02080976776778698, + "kl": 0.0014346256357384846, + "learning_rate": 2.1783333333333334e-06, + "loss": 0.0001, + "num_tokens": 1028624.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 64.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.331995725631714, + "kl": 0.045357080176472664, + "learning_rate": 2.178e-06, + "loss": -0.0537, + "num_tokens": 1028925.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 64.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013082461431622505, + "kl": 0.0006137114251032472, + "learning_rate": 2.177666666666667e-06, + "loss": 0.0, + "num_tokens": 1029159.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 64.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07272084057331085, + "kl": 0.004364542197436094, + "learning_rate": 2.1773333333333333e-06, + "loss": 0.0002, + "num_tokens": 1029486.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 64.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0633191391825676, + "kl": 0.003674787236377597, + "learning_rate": 2.177e-06, + "loss": 0.0002, + "num_tokens": 1029750.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 64.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010293482802808285, + "kl": 0.014683597721159458, + "learning_rate": 2.1766666666666664e-06, + "loss": 0.0007, + "num_tokens": 1030010.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 64.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1277714967727661, + "kl": 0.0244381595402956, + "learning_rate": 2.1763333333333332e-06, + "loss": 0.0012, + "num_tokens": 1030333.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0021276595070958138, + "clip_ratio/high_mean": 0.0021276595070958138, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021276595070958138, + "completion_length": 78.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 78.5, + "completions/mean_terminated_length": 78.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 64.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.871088981628418, + "kl": 0.05076884664595127, + "learning_rate": 2.176e-06, + "loss": 0.0865, + "num_tokens": 1030863.0, + "reward": 2.75, + "reward_std": 1.1902379989624023, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.190238118171692, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 64.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4247271716594696, + "kl": 0.04272638913244009, + "learning_rate": 2.1756666666666668e-06, + "loss": 0.0021, + "num_tokens": 1031175.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 64.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008122501894831657, + "kl": 0.0012579411268234253, + "learning_rate": 2.1753333333333336e-06, + "loss": 0.0001, + "num_tokens": 1031455.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 64.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07620169222354889, + "kl": 0.016548125073313713, + "learning_rate": 2.175e-06, + "loss": 0.0008, + "num_tokens": 1031750.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 64.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07757337391376495, + "kl": 0.025181420147418976, + "learning_rate": 2.1746666666666667e-06, + "loss": 0.0013, + "num_tokens": 1032109.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 64.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2446059137582779, + "kl": 0.03443950600922108, + "learning_rate": 2.1743333333333335e-06, + "loss": 0.0017, + "num_tokens": 1032411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 64.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01193982269614935, + "kl": 0.000527895987033844, + "learning_rate": 2.1740000000000003e-06, + "loss": 0.0, + "num_tokens": 1032725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 64.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08313068747520447, + "kl": 0.02006299328058958, + "learning_rate": 2.1736666666666666e-06, + "loss": 0.001, + "num_tokens": 1033071.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 64.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3883612155914307, + "kl": 0.17425037547945976, + "learning_rate": 2.1733333333333334e-06, + "loss": 0.0479, + "num_tokens": 1033408.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 64.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09303069114685059, + "kl": 0.006640292820520699, + "learning_rate": 2.173e-06, + "loss": 0.0004, + "num_tokens": 1033672.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01857859455049038, + "kl": 0.001703709363937378, + "learning_rate": 2.1726666666666666e-06, + "loss": 0.0001, + "num_tokens": 1033888.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 64.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09799665957689285, + "kl": 0.007824670989066362, + "learning_rate": 2.1723333333333333e-06, + "loss": 0.0004, + "num_tokens": 1034181.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 64.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005649257451295853, + "kl": 0.0004132688045501709, + "learning_rate": 2.172e-06, + "loss": 0.0, + "num_tokens": 1034441.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 64.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.091744899749756, + "kl": 0.04026716947555542, + "learning_rate": 2.171666666666667e-06, + "loss": 0.1012, + "num_tokens": 1034727.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 64.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019582778215408325, + "kl": 0.0008381694206036627, + "learning_rate": 2.1713333333333333e-06, + "loss": 0.0, + "num_tokens": 1034991.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 64.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4010632038116455, + "kl": 0.0780010549351573, + "learning_rate": 2.171e-06, + "loss": 0.0308, + "num_tokens": 1035295.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 64.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04042023792862892, + "kl": 0.002430828579235822, + "learning_rate": 2.1706666666666664e-06, + "loss": 0.0001, + "num_tokens": 1035593.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 64.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.300741195678711, + "kl": 0.21128271240741014, + "learning_rate": 2.170333333333333e-06, + "loss": -0.0454, + "num_tokens": 1035946.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 64.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026651019230484962, + "kl": 0.006115816533565521, + "learning_rate": 2.1700000000000004e-06, + "loss": 0.0003, + "num_tokens": 1036234.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 64.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049924980849027634, + "kl": 0.0010692626237869263, + "learning_rate": 2.1696666666666668e-06, + "loss": 0.0001, + "num_tokens": 1036442.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 64.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.594965696334839, + "kl": 0.05903918435797095, + "learning_rate": 2.1693333333333335e-06, + "loss": -0.1254, + "num_tokens": 1036782.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 64.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014029777608811855, + "kl": 0.0003986656665802002, + "learning_rate": 2.169e-06, + "loss": 0.0, + "num_tokens": 1036994.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 64.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07352840155363083, + "kl": 0.00826783082447946, + "learning_rate": 2.1686666666666667e-06, + "loss": 0.0004, + "num_tokens": 1037262.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 64.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05641140416264534, + "kl": 0.009500264655798674, + "learning_rate": 2.1683333333333335e-06, + "loss": 0.0005, + "num_tokens": 1037532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 64.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771590918302536, + "kl": 0.006277984473854303, + "learning_rate": 2.1680000000000002e-06, + "loss": 0.0003, + "num_tokens": 1037836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 64.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04200340434908867, + "kl": 0.042444733902812004, + "learning_rate": 2.1676666666666666e-06, + "loss": 0.0021, + "num_tokens": 1038243.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002923685824498534, + "kl": 7.661432027816772e-05, + "learning_rate": 2.1673333333333334e-06, + "loss": 0.0, + "num_tokens": 1038463.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 64.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05609564855694771, + "kl": 0.01435369485989213, + "learning_rate": 2.167e-06, + "loss": 0.0008, + "num_tokens": 1038749.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 64.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05979641154408455, + "kl": 0.14969105273485184, + "learning_rate": 2.1666666666666665e-06, + "loss": 0.0075, + "num_tokens": 1039060.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 64.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044393863528966904, + "kl": 0.005250045796856284, + "learning_rate": 2.1663333333333333e-06, + "loss": 0.0003, + "num_tokens": 1039344.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031597509514540434, + "kl": 0.003224901854991913, + "learning_rate": 2.166e-06, + "loss": 0.0002, + "num_tokens": 1039580.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 64.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.353760004043579, + "kl": 0.0762135200202465, + "learning_rate": 2.165666666666667e-06, + "loss": -0.1731, + "num_tokens": 1039944.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 92.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 92.5, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 64.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4393672943115234, + "kl": 0.1531917154788971, + "learning_rate": 2.1653333333333332e-06, + "loss": 0.004, + "num_tokens": 1040542.0, + "reward": 3.375, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 64.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13756895065307617, + "kl": 0.01514653256163001, + "learning_rate": 2.165e-06, + "loss": 0.0009, + "num_tokens": 1040822.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 64.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1609680652618408, + "kl": 0.0838811844587326, + "learning_rate": 2.1646666666666664e-06, + "loss": 0.0042, + "num_tokens": 1041066.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 64.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.657221257686615, + "kl": 0.0866007343865931, + "learning_rate": 2.1643333333333336e-06, + "loss": 0.0026, + "num_tokens": 1041320.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 64.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043962541967630386, + "kl": 0.0067591299302875996, + "learning_rate": 2.1640000000000004e-06, + "loss": 0.0004, + "num_tokens": 1041590.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.141776084899902, + "kl": 0.03479503915878013, + "learning_rate": 2.1636666666666667e-06, + "loss": 0.044, + "num_tokens": 1041914.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 65.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9385812282562256, + "kl": 0.050240641459822655, + "learning_rate": 2.1633333333333335e-06, + "loss": 0.0373, + "num_tokens": 1042284.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 65.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14359606802463531, + "kl": 0.018712486838921905, + "learning_rate": 2.163e-06, + "loss": 0.0009, + "num_tokens": 1042590.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 65.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.02384090423584, + "kl": 0.012163963634520769, + "learning_rate": 2.1626666666666667e-06, + "loss": 0.1879, + "num_tokens": 1042866.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 65.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.820356607437134, + "kl": 0.11114047560840845, + "learning_rate": 2.1623333333333334e-06, + "loss": 0.023, + "num_tokens": 1043153.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 65.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009961447678506374, + "kl": 0.014737188816070557, + "learning_rate": 2.1620000000000002e-06, + "loss": 0.0007, + "num_tokens": 1043413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.403839588165283, + "kl": 0.009551340248435736, + "learning_rate": 2.1616666666666666e-06, + "loss": -0.0425, + "num_tokens": 1043695.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 65.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.660060405731201, + "kl": 0.025711173191666603, + "learning_rate": 2.1613333333333334e-06, + "loss": 0.1979, + "num_tokens": 1044054.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04941617697477341, + "kl": 0.006203743629157543, + "learning_rate": 2.161e-06, + "loss": 0.0003, + "num_tokens": 1044322.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 65.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6261063814163208, + "kl": 0.39721184223890305, + "learning_rate": 2.1606666666666665e-06, + "loss": -0.0546, + "num_tokens": 1044694.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 65.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012366310693323612, + "kl": 0.002239805646240711, + "learning_rate": 2.1603333333333333e-06, + "loss": 0.0001, + "num_tokens": 1045006.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 65.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33784300088882446, + "kl": 0.026781465858221054, + "learning_rate": 2.16e-06, + "loss": 0.002, + "num_tokens": 1045266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021401792764663696, + "kl": 0.0011554970405995846, + "learning_rate": 2.159666666666667e-06, + "loss": 0.0001, + "num_tokens": 1045562.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 65.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01038454007357359, + "kl": 0.0004698584525613114, + "learning_rate": 2.1593333333333332e-06, + "loss": 0.0, + "num_tokens": 1045797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 65.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6637165546417236, + "kl": 0.16146893240511417, + "learning_rate": 2.159e-06, + "loss": 0.0344, + "num_tokens": 1046135.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 65.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019443562254309654, + "kl": 0.002966132014989853, + "learning_rate": 2.1586666666666664e-06, + "loss": 0.0001, + "num_tokens": 1046403.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 65.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.420718669891357, + "kl": 0.033904400654137135, + "learning_rate": 2.1583333333333336e-06, + "loss": -0.2862, + "num_tokens": 1046780.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 65.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8334104418754578, + "kl": 0.1618670579046011, + "learning_rate": 2.1580000000000003e-06, + "loss": 0.0086, + "num_tokens": 1047117.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 65.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2802926301956177, + "kl": 0.2563718780875206, + "learning_rate": 2.1576666666666667e-06, + "loss": 0.0105, + "num_tokens": 1047415.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 65.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058315105736255646, + "kl": 0.2578686773777008, + "learning_rate": 2.1573333333333335e-06, + "loss": 0.0129, + "num_tokens": 1047719.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 65.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0315011627972126, + "kl": 0.04525031894445419, + "learning_rate": 2.157e-06, + "loss": 0.0023, + "num_tokens": 1048123.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 65.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009876912459731102, + "kl": 0.0001265406608581543, + "learning_rate": 2.1566666666666666e-06, + "loss": 0.0, + "num_tokens": 1048335.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 65.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061519473791122437, + "kl": 0.006890603573992848, + "learning_rate": 2.1563333333333334e-06, + "loss": 0.0004, + "num_tokens": 1048665.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 65.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.128335475921631, + "kl": 0.04269075766205788, + "learning_rate": 2.156e-06, + "loss": 0.0712, + "num_tokens": 1048961.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 65.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15544039011001587, + "kl": 0.005920931696891785, + "learning_rate": 2.1556666666666666e-06, + "loss": 0.0004, + "num_tokens": 1049167.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06937694549560547, + "kl": 0.0030993041582405567, + "learning_rate": 2.1553333333333333e-06, + "loss": 0.0002, + "num_tokens": 1049430.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 65.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021859167609363794, + "kl": 8.884072303771973e-05, + "learning_rate": 2.155e-06, + "loss": 0.0, + "num_tokens": 1049642.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 65.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002480959054082632, + "kl": 0.0033598914742469788, + "learning_rate": 2.1546666666666665e-06, + "loss": 0.0002, + "num_tokens": 1049878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 65.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040756918489933014, + "kl": 0.016550449654459953, + "learning_rate": 2.1543333333333337e-06, + "loss": 0.0008, + "num_tokens": 1050170.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048881612718105316, + "kl": 0.006315226026345044, + "learning_rate": 2.154e-06, + "loss": 0.0003, + "num_tokens": 1050458.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027717215940356255, + "kl": 0.005355539731681347, + "learning_rate": 2.153666666666667e-06, + "loss": 0.0003, + "num_tokens": 1050730.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 65.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031059106811881065, + "kl": 0.0015912905109871645, + "learning_rate": 2.153333333333333e-06, + "loss": 0.0001, + "num_tokens": 1050949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 65.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0925585925579071, + "kl": 0.05042813625186682, + "learning_rate": 2.153e-06, + "loss": 0.0025, + "num_tokens": 1051261.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 65.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8912014961242676, + "kl": 0.048889182100538164, + "learning_rate": 2.1526666666666668e-06, + "loss": 0.0174, + "num_tokens": 1051574.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 65.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8815157413482666, + "kl": 0.05955894012004137, + "learning_rate": 2.1523333333333335e-06, + "loss": -0.0723, + "num_tokens": 1051917.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 65.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07245248556137085, + "kl": 0.004652391420677304, + "learning_rate": 2.1520000000000003e-06, + "loss": 0.0003, + "num_tokens": 1052156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 65.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.429896831512451, + "kl": 0.13123369216918945, + "learning_rate": 2.1516666666666667e-06, + "loss": -0.0239, + "num_tokens": 1052464.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 65.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07170914113521576, + "kl": 0.018998458050191402, + "learning_rate": 2.1513333333333335e-06, + "loss": 0.001, + "num_tokens": 1052808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 65.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003020386502612382, + "kl": 6.897002458572388e-05, + "learning_rate": 2.151e-06, + "loss": 0.0, + "num_tokens": 1053028.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 65.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32641804218292236, + "kl": 0.06980939954519272, + "learning_rate": 2.1506666666666666e-06, + "loss": 0.0033, + "num_tokens": 1053340.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 65.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06636328995227814, + "kl": 0.019938739016652107, + "learning_rate": 2.1503333333333334e-06, + "loss": 0.001, + "num_tokens": 1053660.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 65.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9884839057922363, + "kl": 0.04400889610406011, + "learning_rate": 2.15e-06, + "loss": 0.0032, + "num_tokens": 1053985.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019705817103385925, + "kl": 0.006303793750703335, + "learning_rate": 2.1496666666666665e-06, + "loss": 0.0003, + "num_tokens": 1054273.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 65.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022153222933411598, + "kl": 0.0006537400186061859, + "learning_rate": 2.1493333333333333e-06, + "loss": 0.0, + "num_tokens": 1054533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 65.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14532485604286194, + "kl": 0.02496556844562292, + "learning_rate": 2.149e-06, + "loss": 0.001, + "num_tokens": 1054799.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 65.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060010410845279694, + "kl": 0.0015776307554915547, + "learning_rate": 2.148666666666667e-06, + "loss": 0.0001, + "num_tokens": 1055055.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3620195388793945, + "kl": 0.008782767690718174, + "learning_rate": 2.1483333333333337e-06, + "loss": 0.2029, + "num_tokens": 1055395.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 65.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013834363780915737, + "kl": 0.0009517103608231992, + "learning_rate": 2.148e-06, + "loss": 0.0, + "num_tokens": 1055655.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2045542448759079, + "kl": 0.04649087227880955, + "learning_rate": 2.147666666666667e-06, + "loss": 0.0023, + "num_tokens": 1055943.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 65.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14134031534194946, + "kl": 0.025466084945946932, + "learning_rate": 2.147333333333333e-06, + "loss": 0.0012, + "num_tokens": 1056214.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 65.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00889053288847208, + "kl": 0.008961380459368229, + "learning_rate": 2.147e-06, + "loss": 0.0004, + "num_tokens": 1056486.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 65.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024751445278525352, + "kl": 0.0030332550404637004, + "learning_rate": 2.1466666666666667e-06, + "loss": 0.0002, + "num_tokens": 1056752.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 65.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00039961549919098616, + "kl": 0.0012107896036468446, + "learning_rate": 2.1463333333333335e-06, + "loss": 0.0001, + "num_tokens": 1057032.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 65.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8616456389427185, + "kl": 0.20754162967205048, + "learning_rate": 2.1460000000000003e-06, + "loss": 0.0104, + "num_tokens": 1057400.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 66.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12785805761814117, + "kl": 0.010238808114081621, + "learning_rate": 2.1456666666666666e-06, + "loss": 0.0005, + "num_tokens": 1057660.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 66.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.238522529602051, + "kl": 0.09039102122187614, + "learning_rate": 2.1453333333333334e-06, + "loss": -0.088, + "num_tokens": 1057990.0, + "reward": 5.0, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.674234628677368, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 66.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009274467825889587, + "kl": 0.008835397195070982, + "learning_rate": 2.145e-06, + "loss": 0.0004, + "num_tokens": 1058262.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 66.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4824423789978027, + "kl": 0.03818679414689541, + "learning_rate": 2.1446666666666666e-06, + "loss": 0.0215, + "num_tokens": 1058611.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 66.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1615619659423828, + "kl": 0.02284571062773466, + "learning_rate": 2.1443333333333334e-06, + "loss": 0.0012, + "num_tokens": 1058875.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 66.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20201905071735382, + "kl": 0.056334005668759346, + "learning_rate": 2.144e-06, + "loss": 0.0027, + "num_tokens": 1059214.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 66.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11449134349822998, + "kl": 0.015466476790606976, + "learning_rate": 2.143666666666667e-06, + "loss": 0.0005, + "num_tokens": 1059468.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 66.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03221873193979263, + "kl": 0.14802279323339462, + "learning_rate": 2.1433333333333333e-06, + "loss": 0.0074, + "num_tokens": 1059780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 66.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.887183666229248, + "kl": 0.04516427032649517, + "learning_rate": 2.143e-06, + "loss": -0.0965, + "num_tokens": 1060140.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 66.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.156033754348755, + "kl": 0.13535203784704208, + "learning_rate": 2.142666666666667e-06, + "loss": -0.1149, + "num_tokens": 1060531.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 66.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02052474021911621, + "kl": 0.004775805864483118, + "learning_rate": 2.1423333333333336e-06, + "loss": 0.0002, + "num_tokens": 1060803.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 66.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00621081842109561, + "kl": 0.0018688691779971123, + "learning_rate": 2.142e-06, + "loss": 0.0001, + "num_tokens": 1061115.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 66.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0538310632109642, + "kl": 0.09351903200149536, + "learning_rate": 2.1416666666666668e-06, + "loss": 0.0048, + "num_tokens": 1061483.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 66.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037144873291254044, + "kl": 0.007186120608821511, + "learning_rate": 2.141333333333333e-06, + "loss": 0.0004, + "num_tokens": 1061772.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 66.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13158610463142395, + "kl": 0.014042772352695465, + "learning_rate": 2.141e-06, + "loss": 0.0007, + "num_tokens": 1062031.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 66.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0042854901403188705, + "kl": 0.00015013217489467934, + "learning_rate": 2.1406666666666667e-06, + "loss": 0.0, + "num_tokens": 1062251.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 66.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.590139150619507, + "kl": 0.17878342419862747, + "learning_rate": 2.1403333333333335e-06, + "loss": 0.0085, + "num_tokens": 1062557.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 66.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7620913982391357, + "kl": 0.03124841209501028, + "learning_rate": 2.1400000000000003e-06, + "loss": 0.09, + "num_tokens": 1062982.0, + "reward": 2.799999952316284, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 66.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04130195826292038, + "kl": 0.005478741135448217, + "learning_rate": 2.1396666666666666e-06, + "loss": 0.0003, + "num_tokens": 1063250.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 66.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2970640659332275, + "kl": 0.01648002862930298, + "learning_rate": 2.1393333333333334e-06, + "loss": 0.0014, + "num_tokens": 1063538.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 66.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04830180108547211, + "kl": 0.00431014085188508, + "learning_rate": 2.1389999999999998e-06, + "loss": 0.0002, + "num_tokens": 1063850.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 66.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011130349710583687, + "kl": 0.0005721114575862885, + "learning_rate": 2.138666666666667e-06, + "loss": 0.0, + "num_tokens": 1064110.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 66.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029078495572321117, + "kl": 7.481127977371216e-05, + "learning_rate": 2.1383333333333333e-06, + "loss": 0.0, + "num_tokens": 1064330.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 66.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.302885055541992, + "kl": 1.9815361201763153, + "learning_rate": 2.138e-06, + "loss": 0.0318, + "num_tokens": 1064628.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 66.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008370169438421726, + "kl": 0.00010447204113006592, + "learning_rate": 2.137666666666667e-06, + "loss": 0.0, + "num_tokens": 1064840.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 66.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08436626195907593, + "kl": 0.029362904839217663, + "learning_rate": 2.1373333333333333e-06, + "loss": 0.0015, + "num_tokens": 1065201.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 66.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0347416065633297, + "kl": 0.0031578628113493323, + "learning_rate": 2.137e-06, + "loss": 0.0002, + "num_tokens": 1065471.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 66.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3766398429870605, + "kl": 0.04959502071142197, + "learning_rate": 2.136666666666667e-06, + "loss": 0.0242, + "num_tokens": 1065805.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 66.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010061761364340782, + "kl": 0.00023939609673107043, + "learning_rate": 2.1363333333333336e-06, + "loss": 0.0, + "num_tokens": 1066061.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 66.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05835021659731865, + "kl": 0.003006845712661743, + "learning_rate": 2.136e-06, + "loss": 0.0001, + "num_tokens": 1066328.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 66.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032158125191926956, + "kl": 0.0005912482738494873, + "learning_rate": 2.1356666666666667e-06, + "loss": 0.0, + "num_tokens": 1066536.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 66.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09703069925308228, + "kl": 0.005293893162161112, + "learning_rate": 2.135333333333333e-06, + "loss": 0.0003, + "num_tokens": 1066784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 66.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06441410630941391, + "kl": 0.013853597454726696, + "learning_rate": 2.135e-06, + "loss": 0.0007, + "num_tokens": 1067111.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 66.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02377534843981266, + "kl": 0.005178789375349879, + "learning_rate": 2.1346666666666667e-06, + "loss": 0.0003, + "num_tokens": 1067389.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 66.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039628561586141586, + "kl": 0.008012112695723772, + "learning_rate": 2.1343333333333335e-06, + "loss": 0.0004, + "num_tokens": 1067695.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 66.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035798411816358566, + "kl": 0.001465982524678111, + "learning_rate": 2.1340000000000002e-06, + "loss": 0.0001, + "num_tokens": 1067957.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 66.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04002681002020836, + "kl": 0.0018101908499374986, + "learning_rate": 2.1336666666666666e-06, + "loss": 0.0001, + "num_tokens": 1068241.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 66.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8050531148910522, + "kl": 0.00274230184732005, + "learning_rate": 2.1333333333333334e-06, + "loss": 0.0297, + "num_tokens": 1068559.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 66.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1238691657781601, + "kl": 0.042561380192637444, + "learning_rate": 2.133e-06, + "loss": 0.0021, + "num_tokens": 1068859.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 66.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007309718639589846, + "kl": 0.0012424737215042114, + "learning_rate": 2.132666666666667e-06, + "loss": 0.0001, + "num_tokens": 1069139.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 66.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0442817322909832, + "kl": 0.009598115226253867, + "learning_rate": 2.1323333333333333e-06, + "loss": 0.0005, + "num_tokens": 1069423.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 66.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.423217296600342, + "kl": 0.2912338078022003, + "learning_rate": 2.132e-06, + "loss": -0.0729, + "num_tokens": 1069693.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 66.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06318531185388565, + "kl": 0.003326006233692169, + "learning_rate": 2.131666666666667e-06, + "loss": 0.0002, + "num_tokens": 1069909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 66.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057171136140823364, + "kl": 0.028508439660072327, + "learning_rate": 2.1313333333333332e-06, + "loss": 0.0014, + "num_tokens": 1070246.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 66.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021175920963287354, + "kl": 0.009201560635119677, + "learning_rate": 2.131e-06, + "loss": 0.0005, + "num_tokens": 1070548.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 66.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014865121804177761, + "kl": 0.00020401179790496826, + "learning_rate": 2.130666666666667e-06, + "loss": 0.0, + "num_tokens": 1070760.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 66.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005663186893798411, + "kl": 7.785111847624648e-06, + "learning_rate": 2.1303333333333336e-06, + "loss": 0.0, + "num_tokens": 1071030.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 66.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002381725935265422, + "kl": 0.003385581076145172, + "learning_rate": 2.13e-06, + "loss": 0.0002, + "num_tokens": 1071266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 66.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.923074245452881, + "kl": 0.21537644416093826, + "learning_rate": 2.1296666666666667e-06, + "loss": 0.0855, + "num_tokens": 1071628.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 66.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13695292174816132, + "kl": 0.025151774752885103, + "learning_rate": 2.129333333333333e-06, + "loss": 0.0013, + "num_tokens": 1071920.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 66.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013088555075228214, + "kl": 0.0007829467358533293, + "learning_rate": 2.129e-06, + "loss": 0.0, + "num_tokens": 1072155.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 66.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027377335354685783, + "kl": 0.004256198415532708, + "learning_rate": 2.128666666666667e-06, + "loss": 0.0002, + "num_tokens": 1072477.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 66.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23087462782859802, + "kl": 0.01953520847018808, + "learning_rate": 2.1283333333333334e-06, + "loss": 0.001, + "num_tokens": 1072775.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 66.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15868648886680603, + "kl": 0.026405527256429195, + "learning_rate": 2.128e-06, + "loss": 0.0013, + "num_tokens": 1073061.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 67.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12318392843008041, + "kl": 0.014631701167672873, + "learning_rate": 2.1276666666666666e-06, + "loss": 0.0007, + "num_tokens": 1073397.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 67.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023509350139647722, + "kl": 0.0033880844712257385, + "learning_rate": 2.1273333333333334e-06, + "loss": 0.0002, + "num_tokens": 1073633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 67.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013437964022159576, + "kl": 0.00028736889362335205, + "learning_rate": 2.127e-06, + "loss": 0.0, + "num_tokens": 1073845.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 67.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006805545650422573, + "kl": 0.0007283776940312237, + "learning_rate": 2.126666666666667e-06, + "loss": 0.0, + "num_tokens": 1074105.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007575757801532745, + "clip_ratio/low_min": 0.007575757801532745, + "clip_ratio/region_mean": 0.007575757801532745, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 67.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9388680458068848, + "kl": 0.2690776288509369, + "learning_rate": 2.1263333333333333e-06, + "loss": -0.0365, + "num_tokens": 1074468.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045186080038547516, + "kl": 0.0023676478303968906, + "learning_rate": 2.126e-06, + "loss": 0.0001, + "num_tokens": 1074735.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 67.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7333438396453857, + "kl": 0.16704384982585907, + "learning_rate": 2.125666666666667e-06, + "loss": 0.04, + "num_tokens": 1075071.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 67.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6389694213867188, + "kl": 0.05160256661474705, + "learning_rate": 2.125333333333333e-06, + "loss": -0.1682, + "num_tokens": 1075422.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 67.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014252202585339546, + "kl": 0.0005327500402927399, + "learning_rate": 2.125e-06, + "loss": 0.0, + "num_tokens": 1075682.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 67.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0724712610244751, + "kl": 0.019095337949693203, + "learning_rate": 2.1246666666666668e-06, + "loss": 0.001, + "num_tokens": 1076022.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 67.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11407589167356491, + "kl": 0.03594349976629019, + "learning_rate": 2.1243333333333335e-06, + "loss": 0.0018, + "num_tokens": 1076317.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 67.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06395824998617172, + "kl": 0.002608716531540267, + "learning_rate": 2.124e-06, + "loss": 0.0001, + "num_tokens": 1076573.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 67.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005554386414587498, + "kl": 0.0009240607614628971, + "learning_rate": 2.1236666666666667e-06, + "loss": 0.0, + "num_tokens": 1076885.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 67.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023439912125468254, + "kl": 0.0022011324763298035, + "learning_rate": 2.123333333333333e-06, + "loss": 0.0001, + "num_tokens": 1077101.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 67.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017596596851944923, + "kl": 0.0027563442708924413, + "learning_rate": 2.1230000000000003e-06, + "loss": 0.0001, + "num_tokens": 1077369.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 67.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24695774912834167, + "kl": 0.010047714225947857, + "learning_rate": 2.122666666666667e-06, + "loss": 0.0006, + "num_tokens": 1077616.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 67.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04528961703181267, + "kl": 0.007282962556928396, + "learning_rate": 2.1223333333333334e-06, + "loss": 0.0004, + "num_tokens": 1077938.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 67.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.12740957736969, + "kl": 0.35478825867176056, + "learning_rate": 2.122e-06, + "loss": -0.0177, + "num_tokens": 1078306.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 67.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009686710312962532, + "kl": 0.01479168375954032, + "learning_rate": 2.1216666666666665e-06, + "loss": 0.0007, + "num_tokens": 1078566.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 67.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1068182960152626, + "kl": 0.02172045409679413, + "learning_rate": 2.1213333333333333e-06, + "loss": 0.0011, + "num_tokens": 1078860.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 48.5, + "completions/mean_terminated_length": 48.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 67.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03950683772563934, + "kl": 0.013203125447034836, + "learning_rate": 2.121e-06, + "loss": 0.0007, + "num_tokens": 1079274.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 67.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11037489771842957, + "kl": 0.05069075897336006, + "learning_rate": 2.120666666666667e-06, + "loss": 0.0024, + "num_tokens": 1079606.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 67.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07049866020679474, + "kl": 0.005110967089422047, + "learning_rate": 2.1203333333333332e-06, + "loss": 0.0003, + "num_tokens": 1079866.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 67.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12705372273921967, + "kl": 0.04665055498480797, + "learning_rate": 2.12e-06, + "loss": 0.0024, + "num_tokens": 1080224.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 67.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00845647044479847, + "kl": 0.0003898218274116516, + "learning_rate": 2.119666666666667e-06, + "loss": 0.0, + "num_tokens": 1080432.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 67.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002862463006749749, + "kl": 7.816404104232788e-05, + "learning_rate": 2.119333333333333e-06, + "loss": 0.0, + "num_tokens": 1080652.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04544331878423691, + "kl": 0.006195220164954662, + "learning_rate": 2.119e-06, + "loss": 0.0003, + "num_tokens": 1080938.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 67.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.987448692321777, + "kl": 0.17379155382514, + "learning_rate": 2.1186666666666667e-06, + "loss": 0.0829, + "num_tokens": 1081272.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 67.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09284070134162903, + "kl": 0.015048619359731674, + "learning_rate": 2.1183333333333335e-06, + "loss": 0.0007, + "num_tokens": 1081600.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 67.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04682357981801033, + "kl": 0.006960212951526046, + "learning_rate": 2.118e-06, + "loss": 0.0003, + "num_tokens": 1081872.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 67.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0363360233604908, + "kl": 0.004977188538759947, + "learning_rate": 2.1176666666666667e-06, + "loss": 0.0002, + "num_tokens": 1082144.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 67.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009375547990202904, + "kl": 0.2671874761581421, + "learning_rate": 2.117333333333333e-06, + "loss": 0.0134, + "num_tokens": 1082448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 67.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04979020357131958, + "kl": 0.005073887296020985, + "learning_rate": 2.1170000000000002e-06, + "loss": 0.0002, + "num_tokens": 1082742.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 67.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.49454399943351746, + "kl": 0.09017374366521835, + "learning_rate": 2.116666666666667e-06, + "loss": 0.0046, + "num_tokens": 1083092.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 67.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008618989959359169, + "kl": 0.0005549837951548398, + "learning_rate": 2.1163333333333334e-06, + "loss": 0.0, + "num_tokens": 1083327.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1355447769165039, + "kl": 0.02079087868332863, + "learning_rate": 2.116e-06, + "loss": 0.0009, + "num_tokens": 1083622.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 67.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025502989068627357, + "kl": 0.0020838241325691342, + "learning_rate": 2.1156666666666665e-06, + "loss": 0.0001, + "num_tokens": 1083904.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 67.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5850510597229, + "kl": 0.3873444255441427, + "learning_rate": 2.1153333333333333e-06, + "loss": 0.0923, + "num_tokens": 1084186.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 67.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0042832233011722565, + "kl": 0.00015462636656593531, + "learning_rate": 2.115e-06, + "loss": 0.0, + "num_tokens": 1084406.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 67.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11391454190015793, + "kl": 0.055850550532341, + "learning_rate": 2.114666666666667e-06, + "loss": 0.0026, + "num_tokens": 1084717.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14850559830665588, + "kl": 0.013492303434759378, + "learning_rate": 2.1143333333333332e-06, + "loss": 0.0006, + "num_tokens": 1085019.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 67.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09155723452568054, + "kl": 0.006057455786503851, + "learning_rate": 2.114e-06, + "loss": 0.0003, + "num_tokens": 1085319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 67.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1932889074087143, + "kl": 0.007851041154935956, + "learning_rate": 2.1136666666666668e-06, + "loss": 0.0004, + "num_tokens": 1085573.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0203304290771484, + "kl": 0.0762731246650219, + "learning_rate": 2.113333333333333e-06, + "loss": 0.004, + "num_tokens": 1085863.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 67.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06873319298028946, + "kl": 0.014164955355226994, + "learning_rate": 2.1130000000000004e-06, + "loss": 0.0007, + "num_tokens": 1086151.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 67.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05913740023970604, + "kl": 0.022879963740706444, + "learning_rate": 2.1126666666666667e-06, + "loss": 0.0011, + "num_tokens": 1086504.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 67.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.191492557525635, + "kl": 0.013359299551666481, + "learning_rate": 2.1123333333333335e-06, + "loss": 0.0315, + "num_tokens": 1086778.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 67.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06391195952892303, + "kl": 0.001900363015010953, + "learning_rate": 2.112e-06, + "loss": 0.0001, + "num_tokens": 1086991.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 67.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2260262966156006, + "kl": 0.02756652794778347, + "learning_rate": 2.1116666666666666e-06, + "loss": 0.0012, + "num_tokens": 1087303.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 67.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05605637654662132, + "kl": 0.008502446697093546, + "learning_rate": 2.1113333333333334e-06, + "loss": 0.0004, + "num_tokens": 1087591.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 67.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23070630431175232, + "kl": 0.18261893093585968, + "learning_rate": 2.111e-06, + "loss": 0.0092, + "num_tokens": 1087901.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 67.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1883108913898468, + "kl": 0.08544060960412025, + "learning_rate": 2.110666666666667e-06, + "loss": 0.0042, + "num_tokens": 1088306.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 67.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015478103421628475, + "kl": 0.004814976826310158, + "learning_rate": 2.1103333333333333e-06, + "loss": 0.0002, + "num_tokens": 1088574.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 67.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03014935925602913, + "kl": 0.00230532290879637, + "learning_rate": 2.11e-06, + "loss": 0.0001, + "num_tokens": 1088892.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010596226900815964, + "kl": 0.008203256875276566, + "learning_rate": 2.1096666666666665e-06, + "loss": 0.0004, + "num_tokens": 1089164.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 68.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005889325402677059, + "kl": 0.0004180723044555634, + "learning_rate": 2.1093333333333333e-06, + "loss": 0.0, + "num_tokens": 1089399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.897578716278076, + "kl": 0.018367459997534752, + "learning_rate": 2.109e-06, + "loss": 0.2064, + "num_tokens": 1089690.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 68.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06088269129395485, + "kl": 0.023177883587777615, + "learning_rate": 2.108666666666667e-06, + "loss": 0.0011, + "num_tokens": 1090030.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 68.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.026470184326172, + "kl": 0.04168402776122093, + "learning_rate": 2.108333333333333e-06, + "loss": 0.0905, + "num_tokens": 1090308.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 68.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11855748295783997, + "kl": 0.009419201407581568, + "learning_rate": 2.108e-06, + "loss": 0.0005, + "num_tokens": 1090650.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 68.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025909509509801865, + "kl": 0.002303547109477222, + "learning_rate": 2.1076666666666668e-06, + "loss": 0.0001, + "num_tokens": 1090927.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 68.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025309741497039795, + "kl": 0.0025870114332064986, + "learning_rate": 2.1073333333333335e-06, + "loss": 0.0001, + "num_tokens": 1091241.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 68.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03963938355445862, + "kl": 0.0064029518398456275, + "learning_rate": 2.1070000000000003e-06, + "loss": 0.0003, + "num_tokens": 1091537.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 68.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17038275301456451, + "kl": 0.00915285013616085, + "learning_rate": 2.1066666666666667e-06, + "loss": 0.0005, + "num_tokens": 1091794.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 68.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0068563055247068405, + "kl": 0.00044394657015800476, + "learning_rate": 2.1063333333333335e-06, + "loss": 0.0, + "num_tokens": 1092054.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 68.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03563537821173668, + "kl": 0.005906453588977456, + "learning_rate": 2.106e-06, + "loss": 0.0003, + "num_tokens": 1092328.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 68.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0603560172021389, + "kl": 0.00466797745320946, + "learning_rate": 2.1056666666666666e-06, + "loss": 0.0002, + "num_tokens": 1092596.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 68.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028345666942186654, + "kl": 7.80150294303894e-05, + "learning_rate": 2.1053333333333334e-06, + "loss": 0.0, + "num_tokens": 1092816.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 68.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04415438324213028, + "kl": 0.0013375446433201432, + "learning_rate": 2.105e-06, + "loss": 0.0001, + "num_tokens": 1093084.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 68.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010908227413892746, + "kl": 0.26693589985370636, + "learning_rate": 2.104666666666667e-06, + "loss": 0.0133, + "num_tokens": 1093388.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 68.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08371180295944214, + "kl": 0.02724790945649147, + "learning_rate": 2.1043333333333333e-06, + "loss": 0.0014, + "num_tokens": 1093753.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 42.5, + "completions/mean_terminated_length": 42.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 68.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4664881229400635, + "kl": 0.02393741998821497, + "learning_rate": 2.104e-06, + "loss": -0.1478, + "num_tokens": 1094143.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 68.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03195560351014137, + "kl": 0.006068086950108409, + "learning_rate": 2.1036666666666665e-06, + "loss": 0.0003, + "num_tokens": 1094434.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 68.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.525040626525879, + "kl": 0.06875466764904559, + "learning_rate": 2.1033333333333332e-06, + "loss": 0.1545, + "num_tokens": 1094711.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 68.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3200722932815552, + "kl": 0.04628994641825557, + "learning_rate": 2.103e-06, + "loss": 0.0022, + "num_tokens": 1094980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 68.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2970018982887268, + "kl": 0.040442612022161484, + "learning_rate": 2.102666666666667e-06, + "loss": 0.002, + "num_tokens": 1095303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 68.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02367353066802025, + "kl": 0.0021707614650949836, + "learning_rate": 2.1023333333333336e-06, + "loss": 0.0001, + "num_tokens": 1095623.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 68.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06780219078063965, + "kl": 0.0032763570779934525, + "learning_rate": 2.102e-06, + "loss": 0.0002, + "num_tokens": 1095883.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050598856061697006, + "kl": 0.008046301547437906, + "learning_rate": 2.1016666666666667e-06, + "loss": 0.0004, + "num_tokens": 1096172.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 68.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7576327323913574, + "kl": 0.062360092997550964, + "learning_rate": 2.1013333333333335e-06, + "loss": -0.0329, + "num_tokens": 1096546.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 68.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0103458222001791, + "kl": 0.014693498611450195, + "learning_rate": 2.1010000000000003e-06, + "loss": 0.0007, + "num_tokens": 1096806.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 68.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013172569684684277, + "kl": 0.00020606070756912231, + "learning_rate": 2.1006666666666667e-06, + "loss": 0.0, + "num_tokens": 1097018.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 68.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09893094748258591, + "kl": 0.015347694512456656, + "learning_rate": 2.1003333333333334e-06, + "loss": 0.0008, + "num_tokens": 1097306.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 68.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2724671959877014, + "kl": 0.02285251021385193, + "learning_rate": 2.1e-06, + "loss": 0.0011, + "num_tokens": 1097550.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 68.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11986280232667923, + "kl": 0.06006912142038345, + "learning_rate": 2.0996666666666666e-06, + "loss": 0.003, + "num_tokens": 1097955.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 68.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031555116176605225, + "kl": 0.15293735265731812, + "learning_rate": 2.0993333333333334e-06, + "loss": 0.0076, + "num_tokens": 1098265.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 68.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056920044124126434, + "kl": 0.02557973563671112, + "learning_rate": 2.099e-06, + "loss": 0.0013, + "num_tokens": 1098620.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18703508377075195, + "kl": 0.0259105428121984, + "learning_rate": 2.098666666666667e-06, + "loss": 0.0013, + "num_tokens": 1098902.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 68.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12755346298217773, + "kl": 0.009138700319454074, + "learning_rate": 2.0983333333333333e-06, + "loss": 0.0006, + "num_tokens": 1099147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 68.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10238856077194214, + "kl": 0.035535553470253944, + "learning_rate": 2.098e-06, + "loss": 0.0018, + "num_tokens": 1099463.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 68.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027655061334371567, + "kl": 0.006079294253140688, + "learning_rate": 2.0976666666666664e-06, + "loss": 0.0003, + "num_tokens": 1099767.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 68.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004385136999189854, + "kl": 0.00014868975267745554, + "learning_rate": 2.0973333333333336e-06, + "loss": 0.0, + "num_tokens": 1099987.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 68.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08086896687746048, + "kl": 0.01650427095592022, + "learning_rate": 2.097e-06, + "loss": 0.0008, + "num_tokens": 1100293.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 68.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005045006982982159, + "kl": 0.0011046454310417175, + "learning_rate": 2.0966666666666668e-06, + "loss": 0.0001, + "num_tokens": 1100605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 68.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014279398135840893, + "kl": 0.00030957162380218506, + "learning_rate": 2.0963333333333336e-06, + "loss": 0.0, + "num_tokens": 1100817.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 68.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09553996473550797, + "kl": 0.02474972326308489, + "learning_rate": 2.096e-06, + "loss": 0.0012, + "num_tokens": 1101162.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 68.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2830212116241455, + "kl": 0.05193134769797325, + "learning_rate": 2.0956666666666667e-06, + "loss": 0.1303, + "num_tokens": 1101511.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2090346813201904, + "kl": 0.09855070896446705, + "learning_rate": 2.0953333333333335e-06, + "loss": 0.0045, + "num_tokens": 1101801.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 68.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8406456112861633, + "kl": 0.07098456658422947, + "learning_rate": 2.0950000000000003e-06, + "loss": 0.0041, + "num_tokens": 1102081.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 68.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002291263546794653, + "kl": 0.0033948197960853577, + "learning_rate": 2.0946666666666666e-06, + "loss": 0.0002, + "num_tokens": 1102317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 68.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32667556405067444, + "kl": 0.07127051055431366, + "learning_rate": 2.0943333333333334e-06, + "loss": 0.0035, + "num_tokens": 1102658.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09607010334730148, + "kl": 0.011841509491205215, + "learning_rate": 2.0939999999999998e-06, + "loss": 0.0006, + "num_tokens": 1102960.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 68.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0636925920844078, + "kl": 0.042949215858243406, + "learning_rate": 2.0936666666666666e-06, + "loss": 0.0022, + "num_tokens": 1103326.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 68.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013012955896556377, + "kl": 0.00017833709716796875, + "learning_rate": 2.0933333333333333e-06, + "loss": 0.0, + "num_tokens": 1103538.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 68.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011885164305567741, + "kl": 0.002513908431865275, + "learning_rate": 2.093e-06, + "loss": 0.0001, + "num_tokens": 1103822.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 68.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.811704158782959, + "kl": 0.0468473955988884, + "learning_rate": 2.092666666666667e-06, + "loss": 0.0902, + "num_tokens": 1104134.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 68.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01888645812869072, + "kl": 0.0030528414936270565, + "learning_rate": 2.0923333333333333e-06, + "loss": 0.0001, + "num_tokens": 1104392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 68.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24757111072540283, + "kl": 0.026431400794535875, + "learning_rate": 2.092e-06, + "loss": 0.0014, + "num_tokens": 1104668.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08265064656734467, + "kl": 0.016250974498689175, + "learning_rate": 2.0916666666666664e-06, + "loss": 0.0008, + "num_tokens": 1104988.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 69.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019190331920981407, + "kl": 0.00369901186786592, + "learning_rate": 2.0913333333333336e-06, + "loss": 0.0002, + "num_tokens": 1105260.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 69.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08026011288166046, + "kl": 0.004230632446706295, + "learning_rate": 2.091e-06, + "loss": 0.0002, + "num_tokens": 1105523.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 69.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13460038602352142, + "kl": 0.020469567651161924, + "learning_rate": 2.0906666666666668e-06, + "loss": 0.0011, + "num_tokens": 1105829.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14596623182296753, + "kl": 0.029725193977355957, + "learning_rate": 2.0903333333333335e-06, + "loss": 0.0015, + "num_tokens": 1106101.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 69.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12871718406677246, + "kl": 0.06440968252718449, + "learning_rate": 2.09e-06, + "loss": 0.0032, + "num_tokens": 1106437.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 69.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009899850934743881, + "kl": 0.008623609319329262, + "learning_rate": 2.0896666666666667e-06, + "loss": 0.0004, + "num_tokens": 1106709.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0689016580581665, + "kl": 0.04338335618376732, + "learning_rate": 2.0893333333333335e-06, + "loss": 0.0022, + "num_tokens": 1107009.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 69.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8067128658294678, + "kl": 0.023298457264900208, + "learning_rate": 2.0890000000000002e-06, + "loss": -0.0239, + "num_tokens": 1107360.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 69.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16316451132297516, + "kl": 0.043815051671117544, + "learning_rate": 2.0886666666666666e-06, + "loss": 0.0018, + "num_tokens": 1107751.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08839473128318787, + "kl": 0.010210568085312843, + "learning_rate": 2.0883333333333334e-06, + "loss": 0.0005, + "num_tokens": 1108040.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09216146916151047, + "kl": 0.13382838666439056, + "learning_rate": 2.0879999999999997e-06, + "loss": 0.0064, + "num_tokens": 1108365.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07643798738718033, + "kl": 0.0196725451387465, + "learning_rate": 2.0876666666666665e-06, + "loss": 0.001, + "num_tokens": 1108700.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 69.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.139026165008545, + "kl": 0.032755774445831776, + "learning_rate": 2.0873333333333337e-06, + "loss": -0.0303, + "num_tokens": 1108988.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 69.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019622936844825745, + "kl": 0.0032946651335805655, + "learning_rate": 2.087e-06, + "loss": 0.0002, + "num_tokens": 1109272.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 69.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04078260809183121, + "kl": 0.004805737407878041, + "learning_rate": 2.086666666666667e-06, + "loss": 0.0002, + "num_tokens": 1109594.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 69.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15685616433620453, + "kl": 0.00503178930375725, + "learning_rate": 2.0863333333333332e-06, + "loss": 0.0003, + "num_tokens": 1109818.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 69.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15890878438949585, + "kl": 0.019384440034627914, + "learning_rate": 2.086e-06, + "loss": 0.001, + "num_tokens": 1110078.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 69.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012090597301721573, + "kl": 0.0009440481662750244, + "learning_rate": 2.085666666666667e-06, + "loss": 0.0, + "num_tokens": 1110288.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 69.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04344664514064789, + "kl": 0.009759694337844849, + "learning_rate": 2.0853333333333336e-06, + "loss": 0.0005, + "num_tokens": 1110572.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 69.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0350305549800396, + "kl": 0.011384228244423866, + "learning_rate": 2.085e-06, + "loss": 0.0006, + "num_tokens": 1110833.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 69.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.895107626914978, + "kl": 0.2670859545469284, + "learning_rate": 2.0846666666666667e-06, + "loss": 0.0134, + "num_tokens": 1111205.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01900782249867916, + "kl": 0.0014404583489522338, + "learning_rate": 2.0843333333333335e-06, + "loss": 0.0001, + "num_tokens": 1111524.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 69.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024961652234196663, + "kl": 0.003362610936164856, + "learning_rate": 2.084e-06, + "loss": 0.0002, + "num_tokens": 1111760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 69.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008233225904405117, + "kl": 0.001639210619032383, + "learning_rate": 2.0836666666666667e-06, + "loss": 0.0001, + "num_tokens": 1112072.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 69.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.388033390045166, + "kl": 0.02213809033855796, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.053, + "num_tokens": 1112374.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 69.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.393929958343506, + "kl": 0.1466339000617154, + "learning_rate": 2.0830000000000002e-06, + "loss": 0.0076, + "num_tokens": 1112642.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 69.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04682360589504242, + "kl": 0.04479285143315792, + "learning_rate": 2.0826666666666666e-06, + "loss": 0.0022, + "num_tokens": 1113054.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 69.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.27522349357605, + "kl": 0.1270275004208088, + "learning_rate": 2.0823333333333334e-06, + "loss": -0.0048, + "num_tokens": 1113421.0, + "reward": 5.625, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 2.462214469909668, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 69.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006829011719673872, + "kl": 5.1021575927734375e-05, + "learning_rate": 2.0819999999999997e-06, + "loss": 0.0, + "num_tokens": 1113633.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 69.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14577077329158783, + "kl": 0.0053781368769705296, + "learning_rate": 2.081666666666667e-06, + "loss": 0.0004, + "num_tokens": 1113873.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09836200624704361, + "kl": 0.016962699592113495, + "learning_rate": 2.0813333333333337e-06, + "loss": 0.0008, + "num_tokens": 1114192.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 71.75, + "completions/mean_terminated_length": 10.333333969116211, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 69.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.552198886871338, + "kl": 0.35889815050177276, + "learning_rate": 2.081e-06, + "loss": 0.4608, + "num_tokens": 1114703.0, + "reward": 4.925000190734863, + "reward_std": 3.797696828842163, + "rewards/reward_combined/mean": 4.925000190734863, + "rewards/reward_combined/std": 3.797696828842163, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 69.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004982537589967251, + "kl": 0.001121779263485223, + "learning_rate": 2.080666666666667e-06, + "loss": 0.0001, + "num_tokens": 1115015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04316097870469093, + "kl": 0.007986365118995309, + "learning_rate": 2.080333333333333e-06, + "loss": 0.0004, + "num_tokens": 1115306.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014881937764585018, + "kl": 0.0033111422089859843, + "learning_rate": 2.08e-06, + "loss": 0.0002, + "num_tokens": 1115586.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 69.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030525509268045425, + "kl": 0.0018466348992660642, + "learning_rate": 2.0796666666666668e-06, + "loss": 0.0001, + "num_tokens": 1115888.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 69.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06862545013427734, + "kl": 0.0027694710515788756, + "learning_rate": 2.0793333333333336e-06, + "loss": 0.0002, + "num_tokens": 1116110.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 69.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032411668449640274, + "kl": 0.0027646422386169434, + "learning_rate": 2.079e-06, + "loss": 0.0001, + "num_tokens": 1116370.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 69.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0300470981746912, + "kl": 0.0009364724101033062, + "learning_rate": 2.0786666666666667e-06, + "loss": 0.0, + "num_tokens": 1116626.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 69.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.757248878479004, + "kl": 0.022038782015442848, + "learning_rate": 2.0783333333333335e-06, + "loss": 0.0977, + "num_tokens": 1116899.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 69.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002891722833737731, + "kl": 7.560849189758301e-05, + "learning_rate": 2.078e-06, + "loss": 0.0, + "num_tokens": 1117119.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 69.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009057600982487202, + "kl": 0.2671954482793808, + "learning_rate": 2.0776666666666666e-06, + "loss": 0.0134, + "num_tokens": 1117423.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 69.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058537062257528305, + "kl": 0.004185812082141638, + "learning_rate": 2.0773333333333334e-06, + "loss": 0.0002, + "num_tokens": 1117671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 69.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008690671995282173, + "kl": 0.0008343184599652886, + "learning_rate": 2.077e-06, + "loss": 0.0, + "num_tokens": 1117943.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 69.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.12689733505249, + "kl": 0.026604359038174152, + "learning_rate": 2.0766666666666665e-06, + "loss": 0.1347, + "num_tokens": 1118256.0, + "reward": 5.0, + "reward_std": 5.0, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 5.0, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 60.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 60.5, + "completions/mean_terminated_length": 60.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 69.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4342854022979736, + "kl": 0.031211985275149345, + "learning_rate": 2.0763333333333333e-06, + "loss": 0.3869, + "num_tokens": 1118714.0, + "reward": 2.799999952316284, + "reward_std": 1.399999976158142, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 1.399999976158142, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13445831835269928, + "kl": 0.03361153043806553, + "learning_rate": 2.0759999999999997e-06, + "loss": 0.0017, + "num_tokens": 1119029.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 69.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035520054399967194, + "kl": 0.015147535130381584, + "learning_rate": 2.075666666666667e-06, + "loss": 0.0008, + "num_tokens": 1119403.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 69.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04940409958362579, + "kl": 0.002904816879890859, + "learning_rate": 2.0753333333333337e-06, + "loss": 0.0001, + "num_tokens": 1119729.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 69.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08960027247667313, + "kl": 0.016207166947424412, + "learning_rate": 2.075e-06, + "loss": 0.0008, + "num_tokens": 1120027.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 69.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013842816464602947, + "kl": 0.0006515667628264055, + "learning_rate": 2.074666666666667e-06, + "loss": 0.0, + "num_tokens": 1120262.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 69.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010114590637385845, + "kl": 0.0011583297746255994, + "learning_rate": 2.074333333333333e-06, + "loss": 0.0001, + "num_tokens": 1120544.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 69.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06705860048532486, + "kl": 0.033969609066843987, + "learning_rate": 2.074e-06, + "loss": 0.0017, + "num_tokens": 1120877.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 70.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3373340666294098, + "kl": 0.02420927892671898, + "learning_rate": 2.0736666666666667e-06, + "loss": 0.0011, + "num_tokens": 1121137.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 70.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01912330836057663, + "kl": 0.003732536919414997, + "learning_rate": 2.0733333333333335e-06, + "loss": 0.0002, + "num_tokens": 1121415.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 70.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1849440634250641, + "kl": 0.030430767685174942, + "learning_rate": 2.073e-06, + "loss": 0.0015, + "num_tokens": 1121714.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 70.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04467763379216194, + "kl": 0.03808296099305153, + "learning_rate": 2.0726666666666667e-06, + "loss": 0.0019, + "num_tokens": 1122119.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 70.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17115505039691925, + "kl": 0.016103115398436785, + "learning_rate": 2.0723333333333335e-06, + "loss": 0.0008, + "num_tokens": 1122417.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 70.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009636009112000465, + "kl": 0.2670990973711014, + "learning_rate": 2.072e-06, + "loss": 0.0134, + "num_tokens": 1122721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 70.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.894669532775879, + "kl": 0.13417953625321388, + "learning_rate": 2.071666666666667e-06, + "loss": 0.0636, + "num_tokens": 1123059.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 70.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03819984570145607, + "kl": 0.005596211412921548, + "learning_rate": 2.0713333333333334e-06, + "loss": 0.0003, + "num_tokens": 1123332.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 70.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00319188111461699, + "kl": 0.0003730393946170807, + "learning_rate": 2.071e-06, + "loss": 0.0, + "num_tokens": 1123592.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 70.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10953184962272644, + "kl": 0.01062619686126709, + "learning_rate": 2.0706666666666665e-06, + "loss": 0.0004, + "num_tokens": 1123846.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 70.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2885751724243164, + "kl": 0.02905437909066677, + "learning_rate": 2.0703333333333333e-06, + "loss": 0.0015, + "num_tokens": 1124104.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 70.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09554505348205566, + "kl": 0.00447315294877626, + "learning_rate": 2.07e-06, + "loss": 0.0002, + "num_tokens": 1124361.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 70.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011061636731028557, + "kl": 0.01441561197862029, + "learning_rate": 2.069666666666667e-06, + "loss": 0.0007, + "num_tokens": 1124621.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 70.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4297924041748047, + "kl": 0.038245189003646374, + "learning_rate": 2.0693333333333337e-06, + "loss": -0.0629, + "num_tokens": 1124965.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 70.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12185338884592056, + "kl": 0.0289075025357306, + "learning_rate": 2.069e-06, + "loss": 0.0015, + "num_tokens": 1125251.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 70.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17212502658367157, + "kl": 0.039578577503561974, + "learning_rate": 2.068666666666667e-06, + "loss": 0.002, + "num_tokens": 1125612.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 70.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02968708984553814, + "kl": 0.0010114670731127262, + "learning_rate": 2.068333333333333e-06, + "loss": 0.0001, + "num_tokens": 1125886.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 70.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8477181792259216, + "kl": 0.0689125619828701, + "learning_rate": 2.068e-06, + "loss": 0.0035, + "num_tokens": 1126225.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 70.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01068786345422268, + "kl": 0.001261926256120205, + "learning_rate": 2.0676666666666667e-06, + "loss": 0.0001, + "num_tokens": 1126543.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 70.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17308853566646576, + "kl": 0.01491808972787112, + "learning_rate": 2.0673333333333335e-06, + "loss": 0.0007, + "num_tokens": 1126811.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 70.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059425778687000275, + "kl": 0.01658582268282771, + "learning_rate": 2.067e-06, + "loss": 0.0008, + "num_tokens": 1127157.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 70.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05604563653469086, + "kl": 0.007753998972475529, + "learning_rate": 2.0666666666666666e-06, + "loss": 0.0004, + "num_tokens": 1127471.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 82.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 70.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.353825330734253, + "kl": 0.046961236745119095, + "learning_rate": 2.0663333333333334e-06, + "loss": 0.4165, + "num_tokens": 1128035.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 70.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.302901268005371, + "kl": 0.003622759133577347, + "learning_rate": 2.066e-06, + "loss": 0.3528, + "num_tokens": 1128271.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 70.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06839118897914886, + "kl": 0.01825037319213152, + "learning_rate": 2.065666666666667e-06, + "loss": 0.0009, + "num_tokens": 1128564.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 70.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026509441435337067, + "kl": 0.0017391294240951538, + "learning_rate": 2.0653333333333334e-06, + "loss": 0.0001, + "num_tokens": 1128776.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 70.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00028999053756706417, + "kl": 7.423758506774902e-05, + "learning_rate": 2.065e-06, + "loss": 0.0, + "num_tokens": 1128996.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 70.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15088064968585968, + "kl": 0.01860713306814432, + "learning_rate": 2.0646666666666665e-06, + "loss": 0.0009, + "num_tokens": 1129261.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.01785714365541935, + "clip_ratio/high_mean": 0.01785714365541935, + "clip_ratio/low_mean": 0.014285714365541935, + "clip_ratio/low_min": 0.014285714365541935, + "clip_ratio/region_mean": 0.032142858020961285, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 70.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.893303632736206, + "kl": 0.21711437217891216, + "learning_rate": 2.0643333333333333e-06, + "loss": 0.0234, + "num_tokens": 1129548.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 70.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03259415924549103, + "kl": 0.1569090113043785, + "learning_rate": 2.064e-06, + "loss": 0.0078, + "num_tokens": 1129856.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 70.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041647255420684814, + "kl": 0.005442672874778509, + "learning_rate": 2.063666666666667e-06, + "loss": 0.0003, + "num_tokens": 1130146.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 70.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03202512487769127, + "kl": 0.0002814382314682007, + "learning_rate": 2.0633333333333336e-06, + "loss": 0.0, + "num_tokens": 1130358.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 70.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0650842934846878, + "kl": 0.011250387877225876, + "learning_rate": 2.063e-06, + "loss": 0.0006, + "num_tokens": 1130626.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 70.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09915147721767426, + "kl": 0.10592405870556831, + "learning_rate": 2.0626666666666668e-06, + "loss": 0.0054, + "num_tokens": 1130996.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 70.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06355968862771988, + "kl": 0.008554394356906414, + "learning_rate": 2.062333333333333e-06, + "loss": 0.0004, + "num_tokens": 1131325.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 70.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30297115445137024, + "kl": 0.029221629025414586, + "learning_rate": 2.062e-06, + "loss": 0.0015, + "num_tokens": 1131636.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 70.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00455054733902216, + "kl": 0.0001924872340168804, + "learning_rate": 2.0616666666666667e-06, + "loss": 0.0, + "num_tokens": 1131896.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 70.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21917401254177094, + "kl": 0.02348022977821529, + "learning_rate": 2.0613333333333335e-06, + "loss": 0.0011, + "num_tokens": 1132178.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 70.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6584258675575256, + "kl": 0.11333750560879707, + "learning_rate": 2.0610000000000003e-06, + "loss": 0.006, + "num_tokens": 1132482.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 70.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.976577281951904, + "kl": 0.38584525883197784, + "learning_rate": 2.0606666666666666e-06, + "loss": 0.0175, + "num_tokens": 1132804.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 70.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.068915605545044, + "kl": 0.046415770426392555, + "learning_rate": 2.0603333333333334e-06, + "loss": -0.0044, + "num_tokens": 1133141.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 70.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9163060188293457, + "kl": 0.006486825877800584, + "learning_rate": 2.06e-06, + "loss": 0.0128, + "num_tokens": 1133478.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 70.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02635047771036625, + "kl": 0.0011291744885966182, + "learning_rate": 2.059666666666667e-06, + "loss": 0.0001, + "num_tokens": 1133713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.01785714365541935, + "clip_ratio/high_mean": 0.01785714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01785714365541935, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 70.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.870720863342285, + "kl": 0.05522574670612812, + "learning_rate": 2.0593333333333333e-06, + "loss": -0.0024, + "num_tokens": 1133983.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 70.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09829191118478775, + "kl": 0.0031094219521037303, + "learning_rate": 2.059e-06, + "loss": 0.0002, + "num_tokens": 1134205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 70.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16233311593532562, + "kl": 0.02425145683810115, + "learning_rate": 2.0586666666666665e-06, + "loss": 0.0013, + "num_tokens": 1134532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 70.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008066217415034771, + "kl": 0.0011122801224701107, + "learning_rate": 2.0583333333333332e-06, + "loss": 0.0001, + "num_tokens": 1134812.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 70.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08590252697467804, + "kl": 0.002045929431915283, + "learning_rate": 2.058e-06, + "loss": 0.0001, + "num_tokens": 1135016.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 70.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009715806692838669, + "kl": 0.008551953826099634, + "learning_rate": 2.057666666666667e-06, + "loss": 0.0004, + "num_tokens": 1135288.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 70.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020628825295716524, + "kl": 0.00343361496925354, + "learning_rate": 2.0573333333333336e-06, + "loss": 0.0002, + "num_tokens": 1135524.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 70.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05215715244412422, + "kl": 0.0027110169176012278, + "learning_rate": 2.057e-06, + "loss": 0.0001, + "num_tokens": 1135824.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 70.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003164144931361079, + "kl": 0.00020235776901245117, + "learning_rate": 2.0566666666666667e-06, + "loss": 0.0, + "num_tokens": 1136068.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 70.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07815029472112656, + "kl": 0.00721925962716341, + "learning_rate": 2.056333333333333e-06, + "loss": 0.0004, + "num_tokens": 1136358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 70.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02345367521047592, + "kl": 0.010234278626739979, + "learning_rate": 2.0560000000000003e-06, + "loss": 0.0006, + "num_tokens": 1136741.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048825547099113464, + "kl": 0.007530066650360823, + "learning_rate": 2.0556666666666667e-06, + "loss": 0.0004, + "num_tokens": 1137039.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 71.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003058707807213068, + "kl": 6.46635890007019e-05, + "learning_rate": 2.0553333333333334e-06, + "loss": 0.0, + "num_tokens": 1137259.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 71.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2617404460906982, + "kl": 0.03131976258009672, + "learning_rate": 2.0550000000000002e-06, + "loss": 0.0064, + "num_tokens": 1137547.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10208529233932495, + "kl": 0.00822573306504637, + "learning_rate": 2.0546666666666666e-06, + "loss": 0.0004, + "num_tokens": 1137843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 71.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.862679481506348, + "kl": 0.045759277418255806, + "learning_rate": 2.0543333333333334e-06, + "loss": 0.2109, + "num_tokens": 1138128.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03401223570108414, + "kl": 0.005740228341892362, + "learning_rate": 2.054e-06, + "loss": 0.0003, + "num_tokens": 1138426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 71.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04922451823949814, + "kl": 0.004779013805091381, + "learning_rate": 2.053666666666667e-06, + "loss": 0.0002, + "num_tokens": 1138722.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 71.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02833518199622631, + "kl": 0.00251924991607666, + "learning_rate": 2.0533333333333333e-06, + "loss": 0.0001, + "num_tokens": 1138934.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 71.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01930255815386772, + "kl": 0.00031400471925735474, + "learning_rate": 2.053e-06, + "loss": 0.0, + "num_tokens": 1139146.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 71.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09901589155197144, + "kl": 0.13444199413061142, + "learning_rate": 2.0526666666666664e-06, + "loss": 0.0067, + "num_tokens": 1139518.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 71.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008475382812321186, + "kl": 0.0018011517822742462, + "learning_rate": 2.0523333333333332e-06, + "loss": 0.0001, + "num_tokens": 1139830.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 71.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20135091245174408, + "kl": 0.035652560414746404, + "learning_rate": 2.052e-06, + "loss": 0.0018, + "num_tokens": 1140098.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 71.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.197789192199707, + "kl": 0.16001486778259277, + "learning_rate": 2.0516666666666668e-06, + "loss": -0.0722, + "num_tokens": 1140464.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 71.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030362692195922136, + "kl": 0.0003831550420727581, + "learning_rate": 2.0513333333333336e-06, + "loss": 0.0, + "num_tokens": 1140683.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 71.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.256838083267212, + "kl": 0.05211942456662655, + "learning_rate": 2.051e-06, + "loss": 0.04, + "num_tokens": 1141092.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 71.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016511255875229836, + "kl": 0.001533987175207585, + "learning_rate": 2.0506666666666667e-06, + "loss": 0.0001, + "num_tokens": 1141410.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 71.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04304053634405136, + "kl": 0.007785625057294965, + "learning_rate": 2.050333333333333e-06, + "loss": 0.0003, + "num_tokens": 1141666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.539617657661438, + "kl": 0.03314585331827402, + "learning_rate": 2.0500000000000003e-06, + "loss": 0.4536, + "num_tokens": 1142181.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 71.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06793303787708282, + "kl": 0.025253762491047382, + "learning_rate": 2.0496666666666666e-06, + "loss": 0.0013, + "num_tokens": 1142474.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.908078670501709, + "kl": 0.09509957768023014, + "learning_rate": 2.0493333333333334e-06, + "loss": 0.1365, + "num_tokens": 1142769.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 71.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013990159146487713, + "kl": 0.013835089281201363, + "learning_rate": 2.049e-06, + "loss": 0.0007, + "num_tokens": 1143029.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 71.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029038339853286743, + "kl": 0.001210396527312696, + "learning_rate": 2.0486666666666666e-06, + "loss": 0.0001, + "num_tokens": 1143299.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12904158234596252, + "kl": 0.0114885657094419, + "learning_rate": 2.0483333333333333e-06, + "loss": 0.0006, + "num_tokens": 1143567.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 71.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.977405309677124, + "kl": 0.020577928982675076, + "learning_rate": 2.048e-06, + "loss": 0.0686, + "num_tokens": 1143905.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 71.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008668515365570784, + "kl": 0.0011933803907595575, + "learning_rate": 2.047666666666667e-06, + "loss": 0.0001, + "num_tokens": 1144182.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 71.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05257728695869446, + "kl": 0.008434228366240859, + "learning_rate": 2.0473333333333333e-06, + "loss": 0.0004, + "num_tokens": 1144509.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 71.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028920171782374382, + "kl": 0.0005989249621052295, + "learning_rate": 2.047e-06, + "loss": 0.0, + "num_tokens": 1144758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 71.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2264093160629272, + "kl": 0.18462391011416912, + "learning_rate": 2.0466666666666664e-06, + "loss": 0.0089, + "num_tokens": 1145146.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08258006721735, + "kl": 0.01812015101313591, + "learning_rate": 2.046333333333333e-06, + "loss": 0.001, + "num_tokens": 1145435.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 71.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.256923198699951, + "kl": 0.7270060628652573, + "learning_rate": 2.0460000000000004e-06, + "loss": 0.0602, + "num_tokens": 1145740.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 71.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1236493587493896, + "kl": 0.22780824918299913, + "learning_rate": 2.0456666666666668e-06, + "loss": -0.0639, + "num_tokens": 1146048.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 71.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08658071607351303, + "kl": 0.14387647807598114, + "learning_rate": 2.0453333333333335e-06, + "loss": 0.0071, + "num_tokens": 1146365.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 71.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050202954560518265, + "kl": 0.005375309803639539, + "learning_rate": 2.045e-06, + "loss": 0.0003, + "num_tokens": 1146637.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 71.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01984328404068947, + "kl": 0.002279287320561707, + "learning_rate": 2.0446666666666667e-06, + "loss": 0.0001, + "num_tokens": 1146946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 71.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18695181608200073, + "kl": 0.05382288992404938, + "learning_rate": 2.0443333333333335e-06, + "loss": 0.0026, + "num_tokens": 1147225.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 71.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02230338379740715, + "kl": 0.008178194984793663, + "learning_rate": 2.0440000000000003e-06, + "loss": 0.0004, + "num_tokens": 1147493.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 88.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 32.333335876464844, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 71.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9161524772644043, + "kl": 0.033104510977864265, + "learning_rate": 2.0436666666666666e-06, + "loss": 0.3895, + "num_tokens": 1148070.0, + "reward": 5.300000190734863, + "reward_std": 4.399999618530273, + "rewards/reward_combined/mean": 5.300000190734863, + "rewards/reward_combined/std": 4.400000095367432, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 71.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08402593433856964, + "kl": 0.0031503804493695498, + "learning_rate": 2.0433333333333334e-06, + "loss": 0.0001, + "num_tokens": 1148304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 71.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045012056827545166, + "kl": 0.015415312722325325, + "learning_rate": 2.043e-06, + "loss": 0.0008, + "num_tokens": 1148588.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 71.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06828045099973679, + "kl": 0.0388933252543211, + "learning_rate": 2.0426666666666665e-06, + "loss": 0.0018, + "num_tokens": 1148931.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 71.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.168416976928711, + "kl": 0.3493591845035553, + "learning_rate": 2.0423333333333333e-06, + "loss": 0.0891, + "num_tokens": 1149259.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 71.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046541862189769745, + "kl": 0.006941606290638447, + "learning_rate": 2.042e-06, + "loss": 0.0003, + "num_tokens": 1149548.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 71.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.799485445022583, + "kl": 0.28889428451657295, + "learning_rate": 2.041666666666667e-06, + "loss": 0.02, + "num_tokens": 1149887.0, + "reward": 3.5, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 2.915475845336914, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 71.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.953897476196289, + "kl": 0.14878639951348305, + "learning_rate": 2.0413333333333332e-06, + "loss": 0.0078, + "num_tokens": 1150193.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 71.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6304640769958496, + "kl": 0.047366587445139885, + "learning_rate": 2.041e-06, + "loss": -0.0338, + "num_tokens": 1150540.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 71.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037589848041534424, + "kl": 0.003812772105447948, + "learning_rate": 2.0406666666666664e-06, + "loss": 0.0002, + "num_tokens": 1150800.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 71.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009633529931306839, + "kl": 0.004115089774131775, + "learning_rate": 2.0403333333333336e-06, + "loss": 0.0002, + "num_tokens": 1151016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 71.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004128021188080311, + "kl": 0.0003852955996990204, + "learning_rate": 2.0400000000000004e-06, + "loss": 0.0, + "num_tokens": 1151276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 71.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.983616352081299, + "kl": 0.022872302681207657, + "learning_rate": 2.0396666666666667e-06, + "loss": -0.1267, + "num_tokens": 1151594.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 71.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.120149612426758, + "kl": 0.023815875872969627, + "learning_rate": 2.0393333333333335e-06, + "loss": 0.3868, + "num_tokens": 1151857.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 71.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11076006293296814, + "kl": 0.0252806656062603, + "learning_rate": 2.039e-06, + "loss": 0.0013, + "num_tokens": 1152200.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 71.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0407119020819664, + "kl": 0.005279630655422807, + "learning_rate": 2.0386666666666667e-06, + "loss": 0.0003, + "num_tokens": 1152472.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 71.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03676833212375641, + "kl": 0.0010466799139976501, + "learning_rate": 2.0383333333333334e-06, + "loss": 0.0, + "num_tokens": 1152682.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 71.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04379789158701897, + "kl": 0.001031553721986711, + "learning_rate": 2.0380000000000002e-06, + "loss": 0.0001, + "num_tokens": 1152938.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 72.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7426738739013672, + "kl": 0.02430872805416584, + "learning_rate": 2.0376666666666666e-06, + "loss": -0.0506, + "num_tokens": 1153259.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 72.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010975892655551434, + "kl": 0.0013699769624508917, + "learning_rate": 2.0373333333333334e-06, + "loss": 0.0001, + "num_tokens": 1153579.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 72.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18640156090259552, + "kl": 0.030793381854891777, + "learning_rate": 2.037e-06, + "loss": 0.0015, + "num_tokens": 1153899.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 72.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015834120567888021, + "kl": 0.001036886496876832, + "learning_rate": 2.0366666666666665e-06, + "loss": 0.0001, + "num_tokens": 1154195.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 72.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03269888833165169, + "kl": 0.0013617835938930511, + "learning_rate": 2.0363333333333333e-06, + "loss": 0.0001, + "num_tokens": 1154455.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13884000480175018, + "kl": 0.020147479372099042, + "learning_rate": 2.036e-06, + "loss": 0.0006, + "num_tokens": 1154709.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 72.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9379022121429443, + "kl": 0.08437211811542511, + "learning_rate": 2.035666666666667e-06, + "loss": 0.0443, + "num_tokens": 1155053.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018545180791988969, + "kl": 0.003517255187034607, + "learning_rate": 2.0353333333333332e-06, + "loss": 0.0002, + "num_tokens": 1155289.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 72.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24216555058956146, + "kl": 0.10458581149578094, + "learning_rate": 2.035e-06, + "loss": 0.0053, + "num_tokens": 1155659.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 72.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12269569933414459, + "kl": 0.10996821895241737, + "learning_rate": 2.0346666666666664e-06, + "loss": 0.0054, + "num_tokens": 1155969.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 72.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02984246052801609, + "kl": 0.006525772274471819, + "learning_rate": 2.0343333333333336e-06, + "loss": 0.0003, + "num_tokens": 1156257.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 72.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6496450901031494, + "kl": 0.012791088549420238, + "learning_rate": 2.0340000000000003e-06, + "loss": -0.0001, + "num_tokens": 1156557.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 72.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6115683913230896, + "kl": 0.08949461579322815, + "learning_rate": 2.0336666666666667e-06, + "loss": 0.0047, + "num_tokens": 1156860.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 72.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44598016142845154, + "kl": 0.0961623266339302, + "learning_rate": 2.0333333333333335e-06, + "loss": 0.0049, + "num_tokens": 1157214.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 72.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1155848354101181, + "kl": 0.010166168678551912, + "learning_rate": 2.033e-06, + "loss": 0.0005, + "num_tokens": 1157510.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 72.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019726382568478584, + "kl": 0.041400933638215065, + "learning_rate": 2.0326666666666666e-06, + "loss": 0.0021, + "num_tokens": 1157800.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 72.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004721549805253744, + "kl": 0.00033188238739967346, + "learning_rate": 2.0323333333333334e-06, + "loss": 0.0, + "num_tokens": 1158044.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 50.75, + "completions/mean_terminated_length": 50.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 72.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.119271993637085, + "kl": 0.05787679785862565, + "learning_rate": 2.032e-06, + "loss": 0.0532, + "num_tokens": 1158467.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 72.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07601617276668549, + "kl": 0.04474889859557152, + "learning_rate": 2.0316666666666666e-06, + "loss": 0.002, + "num_tokens": 1158813.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009916847571730614, + "kl": 0.004140004515647888, + "learning_rate": 2.0313333333333333e-06, + "loss": 0.0002, + "num_tokens": 1159029.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 72.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4983761310577393, + "kl": 0.08018971979618073, + "learning_rate": 2.031e-06, + "loss": -0.1516, + "num_tokens": 1159370.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 72.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6229984760284424, + "kl": 0.27881201915442944, + "learning_rate": 2.0306666666666665e-06, + "loss": -0.051, + "num_tokens": 1159669.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 72.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23746268451213837, + "kl": 0.05401692911982536, + "learning_rate": 2.0303333333333337e-06, + "loss": 0.0027, + "num_tokens": 1159965.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 72.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05387220159173012, + "kl": 0.005998404783895239, + "learning_rate": 2.03e-06, + "loss": 0.0003, + "num_tokens": 1160237.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 72.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009281245060265064, + "kl": 0.04096512123942375, + "learning_rate": 2.029666666666667e-06, + "loss": 0.002, + "num_tokens": 1160642.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 72.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006973665440455079, + "kl": 0.0011710290564224124, + "learning_rate": 2.029333333333333e-06, + "loss": 0.0001, + "num_tokens": 1160922.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 72.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033946286886930466, + "kl": 0.0010177769872825593, + "learning_rate": 2.029e-06, + "loss": 0.0001, + "num_tokens": 1161156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 72.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4086551666259766, + "kl": 0.17122620344161987, + "learning_rate": 2.0286666666666668e-06, + "loss": 0.0082, + "num_tokens": 1161426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 72.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10274406522512436, + "kl": 0.03523481450974941, + "learning_rate": 2.0283333333333335e-06, + "loss": 0.0018, + "num_tokens": 1161751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002880946849472821, + "kl": 7.771700620651245e-05, + "learning_rate": 2.0280000000000003e-06, + "loss": 0.0, + "num_tokens": 1161971.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07822265475988388, + "kl": 0.0010858774185180664, + "learning_rate": 2.0276666666666667e-06, + "loss": 0.0001, + "num_tokens": 1162183.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 72.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009367442689836025, + "kl": 0.0009228115668520331, + "learning_rate": 2.0273333333333335e-06, + "loss": 0.0, + "num_tokens": 1162455.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 72.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02175847254693508, + "kl": 0.003536662319675088, + "learning_rate": 2.027e-06, + "loss": 0.0002, + "num_tokens": 1162739.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 72.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.756463050842285, + "kl": 0.026980872498825192, + "learning_rate": 2.0266666666666666e-06, + "loss": -0.0226, + "num_tokens": 1163069.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 72.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05325916409492493, + "kl": 0.011910397559404373, + "learning_rate": 2.0263333333333334e-06, + "loss": 0.0006, + "num_tokens": 1163396.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04871179163455963, + "kl": 0.0012043233145959675, + "learning_rate": 2.026e-06, + "loss": 0.0001, + "num_tokens": 1163617.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 72.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056555528193712234, + "kl": 0.0010234564542770386, + "learning_rate": 2.0256666666666665e-06, + "loss": 0.0001, + "num_tokens": 1163829.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 72.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06023001670837402, + "kl": 0.039688965305686, + "learning_rate": 2.0253333333333333e-06, + "loss": 0.002, + "num_tokens": 1164101.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 72.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008078550919890404, + "kl": 0.009403749369084835, + "learning_rate": 2.025e-06, + "loss": 0.0005, + "num_tokens": 1164373.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 72.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.248742580413818, + "kl": 0.12861930206418037, + "learning_rate": 2.024666666666667e-06, + "loss": 0.0706, + "num_tokens": 1164717.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 72.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05203109234571457, + "kl": 0.007720179157331586, + "learning_rate": 2.0243333333333337e-06, + "loss": 0.0004, + "num_tokens": 1164985.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 72.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02307802066206932, + "kl": 0.0005348950799088925, + "learning_rate": 2.024e-06, + "loss": 0.0, + "num_tokens": 1165241.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 72.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18737901747226715, + "kl": 0.0535675473511219, + "learning_rate": 2.023666666666667e-06, + "loss": 0.0027, + "num_tokens": 1165531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 72.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0984058827161789, + "kl": 0.011192459613084793, + "learning_rate": 2.023333333333333e-06, + "loss": 0.0006, + "num_tokens": 1165836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 72.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030288174748420715, + "kl": 0.0008908142626751214, + "learning_rate": 2.023e-06, + "loss": 0.0, + "num_tokens": 1166100.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 72.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13756528496742249, + "kl": 0.03603429440408945, + "learning_rate": 2.0226666666666667e-06, + "loss": 0.0019, + "num_tokens": 1166428.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 72.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004450445994734764, + "kl": 0.0013517257175408304, + "learning_rate": 2.0223333333333335e-06, + "loss": 0.0001, + "num_tokens": 1166740.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 72.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014697101898491383, + "kl": 0.003958642482757568, + "learning_rate": 2.0220000000000003e-06, + "loss": 0.0002, + "num_tokens": 1167020.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 96.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 96.5, + "completions/mean_terminated_length": 43.333335876464844, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 72.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1491636037826538, + "kl": 0.040199367329478264, + "learning_rate": 2.0216666666666667e-06, + "loss": 0.4241, + "num_tokens": 1167622.0, + "reward": 1.0, + "reward_std": 2.041241407394409, + "rewards/reward_combined/mean": 1.0, + "rewards/reward_combined/std": 2.0412416458129883, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 72.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036026231944561005, + "kl": 0.001046881079673767, + "learning_rate": 2.0213333333333334e-06, + "loss": 0.0001, + "num_tokens": 1167838.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 72.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0066398922353982925, + "kl": 0.0015978366136550903, + "learning_rate": 2.021e-06, + "loss": 0.0001, + "num_tokens": 1168150.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 72.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28849008679389954, + "kl": 0.03843085467815399, + "learning_rate": 2.0206666666666666e-06, + "loss": 0.0026, + "num_tokens": 1168460.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 72.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1604706346988678, + "kl": 0.024443178437650204, + "learning_rate": 2.0203333333333334e-06, + "loss": 0.0012, + "num_tokens": 1168719.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 72.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00877202209085226, + "kl": 0.26718881726264954, + "learning_rate": 2.02e-06, + "loss": 0.0134, + "num_tokens": 1169023.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.530936241149902, + "kl": 0.05582649423740804, + "learning_rate": 2.019666666666667e-06, + "loss": 0.0113, + "num_tokens": 1169289.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 73.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4480419158935547, + "kl": 0.04963378421962261, + "learning_rate": 2.0193333333333333e-06, + "loss": 0.2837, + "num_tokens": 1169643.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06555754691362381, + "kl": 0.004164560232311487, + "learning_rate": 2.019e-06, + "loss": 0.0002, + "num_tokens": 1169913.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 73.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0709647685289383, + "kl": 0.0010515674948692322, + "learning_rate": 2.018666666666667e-06, + "loss": 0.0001, + "num_tokens": 1170125.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 73.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11483173817396164, + "kl": 0.016486276872456074, + "learning_rate": 2.0183333333333336e-06, + "loss": 0.0008, + "num_tokens": 1170427.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 73.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010684136301279068, + "kl": 0.00032773341808933765, + "learning_rate": 2.018e-06, + "loss": 0.0, + "num_tokens": 1170695.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05566558614373207, + "kl": 0.020335861947387457, + "learning_rate": 2.0176666666666668e-06, + "loss": 0.001, + "num_tokens": 1170983.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 73.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24540309607982635, + "kl": 0.018168319948017597, + "learning_rate": 2.017333333333333e-06, + "loss": 0.0009, + "num_tokens": 1171332.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 73.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04036034643650055, + "kl": 0.0038348076632246375, + "learning_rate": 2.017e-06, + "loss": 0.0002, + "num_tokens": 1171598.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 73.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2119781970977783, + "kl": 0.0839585941284895, + "learning_rate": 2.0166666666666667e-06, + "loss": 0.0071, + "num_tokens": 1171902.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20628385245800018, + "kl": 0.030969264917075634, + "learning_rate": 2.0163333333333335e-06, + "loss": 0.0018, + "num_tokens": 1172170.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038065794855356216, + "kl": 0.00194305187324062, + "learning_rate": 2.0160000000000003e-06, + "loss": 0.0001, + "num_tokens": 1172430.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 73.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1545465886592865, + "kl": 0.03250173863489181, + "learning_rate": 2.0156666666666666e-06, + "loss": 0.0016, + "num_tokens": 1172740.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 73.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026166506111621857, + "kl": 0.04374006390571594, + "learning_rate": 2.0153333333333334e-06, + "loss": 0.0022, + "num_tokens": 1173144.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 73.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02362995222210884, + "kl": 0.0029923035763204098, + "learning_rate": 2.0149999999999998e-06, + "loss": 0.0001, + "num_tokens": 1173440.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 73.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6023633480072021, + "kl": 0.0822620838880539, + "learning_rate": 2.014666666666667e-06, + "loss": 0.0045, + "num_tokens": 1173811.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 73.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.950230598449707, + "kl": 0.11748326430097222, + "learning_rate": 2.0143333333333333e-06, + "loss": -0.04, + "num_tokens": 1174082.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 73.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6445791721343994, + "kl": 0.08978034928441048, + "learning_rate": 2.014e-06, + "loss": 0.064, + "num_tokens": 1174435.0, + "reward": 3.5, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 3.674234628677368, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 73.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06531712412834167, + "kl": 0.004923277534544468, + "learning_rate": 2.013666666666667e-06, + "loss": 0.0002, + "num_tokens": 1174747.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006083305459469557, + "kl": 0.00038892030715942383, + "learning_rate": 2.0133333333333333e-06, + "loss": 0.0, + "num_tokens": 1175007.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 73.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016587592661380768, + "kl": 0.013436194974929094, + "learning_rate": 2.013e-06, + "loss": 0.0007, + "num_tokens": 1175267.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041599780321121216, + "kl": 0.007704405812546611, + "learning_rate": 2.012666666666667e-06, + "loss": 0.0004, + "num_tokens": 1175556.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 73.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.49184513092041, + "kl": 0.013620304875075817, + "learning_rate": 2.0123333333333336e-06, + "loss": 0.0268, + "num_tokens": 1175880.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 73.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005873010493814945, + "kl": 0.0004627754387911409, + "learning_rate": 2.012e-06, + "loss": 0.0, + "num_tokens": 1176096.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 73.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05824138596653938, + "kl": 0.008377837482839823, + "learning_rate": 2.0116666666666667e-06, + "loss": 0.0004, + "num_tokens": 1176385.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 73.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.315967082977295, + "kl": 0.13658245280385017, + "learning_rate": 2.011333333333333e-06, + "loss": 0.0387, + "num_tokens": 1176695.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 73.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08227457851171494, + "kl": 0.03622059337794781, + "learning_rate": 2.011e-06, + "loss": 0.0019, + "num_tokens": 1177064.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 73.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026277078315615654, + "kl": 0.007054821355268359, + "learning_rate": 2.0106666666666667e-06, + "loss": 0.0004, + "num_tokens": 1177366.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 73.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.179153561592102, + "kl": 0.26764756441116333, + "learning_rate": 2.0103333333333335e-06, + "loss": 0.0132, + "num_tokens": 1177670.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 73.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09660425037145615, + "kl": 0.026360459625720978, + "learning_rate": 2.0100000000000002e-06, + "loss": 0.0014, + "num_tokens": 1177968.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 73.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.040585517883301, + "kl": 0.07141377031803131, + "learning_rate": 2.0096666666666666e-06, + "loss": -0.0769, + "num_tokens": 1178254.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.465936660766602, + "kl": 0.045494720339775085, + "learning_rate": 2.0093333333333334e-06, + "loss": 0.2404, + "num_tokens": 1178549.0, + "reward": 7.125, + "reward_std": 0.75, + "rewards/reward_combined/mean": 7.125, + "rewards/reward_combined/std": 0.75, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 73.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00247369147837162, + "kl": 0.0004306994378566742, + "learning_rate": 2.0089999999999997e-06, + "loss": 0.0, + "num_tokens": 1178809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 73.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17805567383766174, + "kl": 0.04457160085439682, + "learning_rate": 2.008666666666667e-06, + "loss": 0.0023, + "num_tokens": 1179135.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19030536711215973, + "kl": 0.049640243873000145, + "learning_rate": 2.0083333333333333e-06, + "loss": 0.0023, + "num_tokens": 1179419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 73.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.577756881713867, + "kl": 0.06967569701373577, + "learning_rate": 2.008e-06, + "loss": 0.0198, + "num_tokens": 1179748.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18938206136226654, + "kl": 0.0217663012444973, + "learning_rate": 2.007666666666667e-06, + "loss": 0.0011, + "num_tokens": 1180021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 73.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046438369899988174, + "kl": 0.024857956916093826, + "learning_rate": 2.0073333333333332e-06, + "loss": 0.0011, + "num_tokens": 1180406.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 73.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024009445682168007, + "kl": 0.0034509622491896152, + "learning_rate": 2.007e-06, + "loss": 0.0002, + "num_tokens": 1180690.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 73.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012450838461518288, + "kl": 0.002151109278202057, + "learning_rate": 2.006666666666667e-06, + "loss": 0.0001, + "num_tokens": 1181002.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 73.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045024920254945755, + "kl": 0.00181741124833934, + "learning_rate": 2.0063333333333336e-06, + "loss": 0.0001, + "num_tokens": 1181258.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 73.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01772375963628292, + "kl": 0.0005734023579861969, + "learning_rate": 2.006e-06, + "loss": 0.0, + "num_tokens": 1181493.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 73.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004102497827261686, + "kl": 0.00026682019233703613, + "learning_rate": 2.0056666666666667e-06, + "loss": 0.0, + "num_tokens": 1181737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 73.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03698328137397766, + "kl": 0.0018593758286442608, + "learning_rate": 2.005333333333333e-06, + "loss": 0.0001, + "num_tokens": 1181955.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 73.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16760686039924622, + "kl": 0.14208679646253586, + "learning_rate": 2.005e-06, + "loss": 0.0071, + "num_tokens": 1182327.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 73.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002071453956887126, + "kl": 0.0034464672207832336, + "learning_rate": 2.004666666666667e-06, + "loss": 0.0002, + "num_tokens": 1182563.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 73.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052842941135168076, + "kl": 0.0025670191971585155, + "learning_rate": 2.0043333333333334e-06, + "loss": 0.0001, + "num_tokens": 1182894.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 73.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026878848439082503, + "kl": 8.477270603179932e-05, + "learning_rate": 2.004e-06, + "loss": 0.0, + "num_tokens": 1183114.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 73.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2190403938293457, + "kl": 0.11757975816726685, + "learning_rate": 2.0036666666666666e-06, + "loss": -0.0726, + "num_tokens": 1183456.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 73.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.3524808883667, + "kl": 0.014463717117905617, + "learning_rate": 2.0033333333333334e-06, + "loss": 0.1639, + "num_tokens": 1183675.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.25, + "completions/mean_terminated_length": 3.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 73.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 21.022397994995117, + "kl": 0.05136639624834061, + "learning_rate": 2.003e-06, + "loss": 0.0811, + "num_tokens": 1183884.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 73.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28949376940727234, + "kl": 0.056278922595083714, + "learning_rate": 2.002666666666667e-06, + "loss": 0.003, + "num_tokens": 1184197.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 73.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004327712522353977, + "kl": 0.0012162349303252995, + "learning_rate": 2.0023333333333333e-06, + "loss": 0.0001, + "num_tokens": 1184477.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 73.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04026157408952713, + "kl": 0.0052518503507599235, + "learning_rate": 2.002e-06, + "loss": 0.0002, + "num_tokens": 1184751.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 74.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0968830585479736, + "kl": 0.08913804963231087, + "learning_rate": 2.001666666666667e-06, + "loss": -0.0319, + "num_tokens": 1185117.0, + "reward": 6.0, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 6.0, + "rewards/reward_combined/std": 3.674234628677368, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 74.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.448393821716309, + "kl": 0.24949227273464203, + "learning_rate": 2.001333333333333e-06, + "loss": 0.0291, + "num_tokens": 1185427.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 74.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07311049848794937, + "kl": 0.0031819441937841475, + "learning_rate": 2.001e-06, + "loss": 0.0002, + "num_tokens": 1185648.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19473688304424286, + "kl": 0.027633660472929478, + "learning_rate": 2.0006666666666668e-06, + "loss": 0.0012, + "num_tokens": 1185940.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 74.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12668731808662415, + "kl": 0.018358412198722363, + "learning_rate": 2.0003333333333336e-06, + "loss": 0.0009, + "num_tokens": 1186270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 74.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002034256234765053, + "kl": 0.0034529119729995728, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 1186506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 74.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.552746534347534, + "kl": 0.5999440615996718, + "learning_rate": 1.9996666666666667e-06, + "loss": 0.0618, + "num_tokens": 1186767.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 74.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4583177864551544, + "kl": 0.0795559398829937, + "learning_rate": 1.999333333333333e-06, + "loss": 0.0041, + "num_tokens": 1187063.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 74.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002645017229951918, + "kl": 8.767843246459961e-05, + "learning_rate": 1.9990000000000003e-06, + "loss": 0.0, + "num_tokens": 1187283.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 74.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.329433441162109, + "kl": 0.11925218999385834, + "learning_rate": 1.998666666666667e-06, + "loss": -0.0591, + "num_tokens": 1187642.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 74.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15682768821716309, + "kl": 0.10793199576437473, + "learning_rate": 1.9983333333333334e-06, + "loss": 0.0054, + "num_tokens": 1188012.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 74.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.13586688041687, + "kl": 0.18894056230783463, + "learning_rate": 1.998e-06, + "loss": 0.0798, + "num_tokens": 1188406.0, + "reward": 6.375, + "reward_std": 2.136000871658325, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.136000871658325, + "step": 4007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 74.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008215570822358131, + "kl": 0.001679474487900734, + "learning_rate": 1.9976666666666665e-06, + "loss": 0.0001, + "num_tokens": 1188718.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 74.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16053363680839539, + "kl": 0.02550918608903885, + "learning_rate": 1.9973333333333333e-06, + "loss": 0.0013, + "num_tokens": 1189006.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09577256441116333, + "kl": 0.02533774357289076, + "learning_rate": 1.997e-06, + "loss": 0.0012, + "num_tokens": 1189279.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 74.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022658009082078934, + "kl": 0.0008329413831233978, + "learning_rate": 1.996666666666667e-06, + "loss": 0.0, + "num_tokens": 1189539.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01015686709433794, + "kl": 0.0028411494567990303, + "learning_rate": 1.9963333333333332e-06, + "loss": 0.0001, + "num_tokens": 1189823.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 74.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0235056821256876, + "kl": 0.043126437813043594, + "learning_rate": 1.996e-06, + "loss": 0.0021, + "num_tokens": 1190235.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 74.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04940566420555115, + "kl": 0.002448553335852921, + "learning_rate": 1.995666666666667e-06, + "loss": 0.0001, + "num_tokens": 1190531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17772062122821808, + "kl": 0.037724267691373825, + "learning_rate": 1.995333333333333e-06, + "loss": 0.002, + "num_tokens": 1190819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 74.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.137699127197266, + "kl": 0.2866111099720001, + "learning_rate": 1.995e-06, + "loss": 0.0304, + "num_tokens": 1191124.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 74.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029567595571279526, + "kl": 0.00426570326089859, + "learning_rate": 1.9946666666666667e-06, + "loss": 0.0002, + "num_tokens": 1191414.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20623791217803955, + "kl": 0.026099749375134706, + "learning_rate": 1.9943333333333335e-06, + "loss": 0.0013, + "num_tokens": 1191714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 74.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06797938048839569, + "kl": 0.005014072841731831, + "learning_rate": 1.994e-06, + "loss": 0.0003, + "num_tokens": 1192057.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 74.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027485482394695282, + "kl": 0.0048094624653458595, + "learning_rate": 1.9936666666666667e-06, + "loss": 0.0002, + "num_tokens": 1192377.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 74.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6406961679458618, + "kl": 0.2471523080021143, + "learning_rate": 1.993333333333333e-06, + "loss": -0.0213, + "num_tokens": 1192677.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 74.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07411373406648636, + "kl": 0.0016750767827033997, + "learning_rate": 1.9930000000000002e-06, + "loss": 0.0001, + "num_tokens": 1192883.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 74.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.155965328216553, + "kl": 0.06356980884447694, + "learning_rate": 1.992666666666667e-06, + "loss": 0.1141, + "num_tokens": 1193213.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 74.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015834394842386246, + "kl": 0.0009228939306922257, + "learning_rate": 1.9923333333333334e-06, + "loss": 0.0, + "num_tokens": 1193481.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 74.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050818849354982376, + "kl": 0.004813584499061108, + "learning_rate": 1.992e-06, + "loss": 0.0002, + "num_tokens": 1193741.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 74.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.675247192382812, + "kl": 0.023868614342063665, + "learning_rate": 1.9916666666666665e-06, + "loss": 0.066, + "num_tokens": 1194017.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 74.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.288616180419922, + "kl": 0.06694895215332508, + "learning_rate": 1.9913333333333333e-06, + "loss": 0.1487, + "num_tokens": 1194405.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 4027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 74.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034000832587480545, + "kl": 0.0009955097339116037, + "learning_rate": 1.991e-06, + "loss": 0.0, + "num_tokens": 1194640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 74.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11053633689880371, + "kl": 0.031633369624614716, + "learning_rate": 1.990666666666667e-06, + "loss": 0.0016, + "num_tokens": 1194993.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 74.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06636285781860352, + "kl": 0.004719011951237917, + "learning_rate": 1.9903333333333332e-06, + "loss": 0.0002, + "num_tokens": 1195236.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 74.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4540215730667114, + "kl": 0.06451453268527985, + "learning_rate": 1.99e-06, + "loss": 0.0227, + "num_tokens": 1195570.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 74.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013661684468388557, + "kl": 0.0012496738927438855, + "learning_rate": 1.9896666666666668e-06, + "loss": 0.0001, + "num_tokens": 1195830.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 74.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016131972894072533, + "kl": 0.0032341000624001026, + "learning_rate": 1.989333333333333e-06, + "loss": 0.0002, + "num_tokens": 1196110.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 74.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028665903955698013, + "kl": 0.0018889158964157104, + "learning_rate": 1.9890000000000004e-06, + "loss": 0.0001, + "num_tokens": 1196322.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 74.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062368400394916534, + "kl": 0.00603932049125433, + "learning_rate": 1.9886666666666667e-06, + "loss": 0.0003, + "num_tokens": 1196588.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 74.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00036966640618629754, + "kl": 0.001296249101869762, + "learning_rate": 1.9883333333333335e-06, + "loss": 0.0001, + "num_tokens": 1196865.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 74.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.538760662078857, + "kl": 0.012847235600929707, + "learning_rate": 1.988e-06, + "loss": 0.0291, + "num_tokens": 1197126.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 74.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.997783660888672, + "kl": 0.029787374660372734, + "learning_rate": 1.9876666666666666e-06, + "loss": 0.258, + "num_tokens": 1197426.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 4038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 74.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06280207633972168, + "kl": 0.015097802504897118, + "learning_rate": 1.9873333333333334e-06, + "loss": 0.0008, + "num_tokens": 1197723.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 74.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2232995331287384, + "kl": 0.010640449821949005, + "learning_rate": 1.987e-06, + "loss": 0.0006, + "num_tokens": 1197936.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 74.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016356471925973892, + "kl": 0.0002499848706065677, + "learning_rate": 1.986666666666667e-06, + "loss": 0.0, + "num_tokens": 1198192.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 74.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08152364194393158, + "kl": 0.015393751673400402, + "learning_rate": 1.9863333333333333e-06, + "loss": 0.0008, + "num_tokens": 1198481.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 74.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12052378803491592, + "kl": 0.006488578743301332, + "learning_rate": 1.986e-06, + "loss": 0.0003, + "num_tokens": 1198749.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 74.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04822550714015961, + "kl": 0.012901182286441326, + "learning_rate": 1.9856666666666665e-06, + "loss": 0.0006, + "num_tokens": 1199083.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 74.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14844229817390442, + "kl": 0.0360508244484663, + "learning_rate": 1.9853333333333333e-06, + "loss": 0.0018, + "num_tokens": 1199409.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 74.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04989173635840416, + "kl": 0.027683653868734837, + "learning_rate": 1.985e-06, + "loss": 0.0014, + "num_tokens": 1199725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 48.25, + "completions/mean_terminated_length": 48.25, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 74.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05343243107199669, + "kl": 0.015758017543703318, + "learning_rate": 1.984666666666667e-06, + "loss": 0.0008, + "num_tokens": 1200138.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 74.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1706589311361313, + "kl": 0.019260598346590996, + "learning_rate": 1.984333333333333e-06, + "loss": 0.001, + "num_tokens": 1200406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 74.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0880153700709343, + "kl": 0.004230510909110308, + "learning_rate": 1.984e-06, + "loss": 0.0002, + "num_tokens": 1200718.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010300721041858196, + "kl": 0.004091762006282806, + "learning_rate": 1.9836666666666668e-06, + "loss": 0.0002, + "num_tokens": 1200934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 75.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07099296897649765, + "kl": 0.010204406222328544, + "learning_rate": 1.983333333333333e-06, + "loss": 0.0005, + "num_tokens": 1201229.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0166774932295084, + "kl": 0.0005081444978713989, + "learning_rate": 1.9830000000000003e-06, + "loss": 0.0, + "num_tokens": 1201441.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 75.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041643235832452774, + "kl": 0.013038936536759138, + "learning_rate": 1.9826666666666667e-06, + "loss": 0.0007, + "num_tokens": 1201729.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 75.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08384043723344803, + "kl": 0.004945088003296405, + "learning_rate": 1.9823333333333335e-06, + "loss": 0.0003, + "num_tokens": 1201991.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 75.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08956126868724823, + "kl": 0.016426128335297108, + "learning_rate": 1.982e-06, + "loss": 0.0009, + "num_tokens": 1202273.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09401659667491913, + "kl": 0.0030057430267333984, + "learning_rate": 1.9816666666666666e-06, + "loss": 0.0002, + "num_tokens": 1202485.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 75.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06951751559972763, + "kl": 0.03467301279306412, + "learning_rate": 1.9813333333333334e-06, + "loss": 0.0012, + "num_tokens": 1202837.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 75.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03010636568069458, + "kl": 0.04167941212654114, + "learning_rate": 1.981e-06, + "loss": 0.0021, + "num_tokens": 1203128.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 75.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1628406047821045, + "kl": 0.004340556683018804, + "learning_rate": 1.980666666666667e-06, + "loss": 0.0197, + "num_tokens": 1203418.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 4059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 75.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.766680121421814, + "kl": 0.185578390955925, + "learning_rate": 1.9803333333333333e-06, + "loss": -0.009, + "num_tokens": 1203799.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 75.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08750255405902863, + "kl": 0.008228718303143978, + "learning_rate": 1.98e-06, + "loss": 0.0004, + "num_tokens": 1204108.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 75.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011226206086575985, + "kl": 0.26686032116413116, + "learning_rate": 1.9796666666666665e-06, + "loss": 0.0133, + "num_tokens": 1204412.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017707949737086892, + "kl": 0.0035208910703659058, + "learning_rate": 1.9793333333333332e-06, + "loss": 0.0002, + "num_tokens": 1204648.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 75.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4164939522743225, + "kl": 0.049409836530685425, + "learning_rate": 1.979e-06, + "loss": 0.0025, + "num_tokens": 1204947.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 75.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021025974303483963, + "kl": 0.002871686825528741, + "learning_rate": 1.978666666666667e-06, + "loss": 0.0001, + "num_tokens": 1205243.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 75.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06047816202044487, + "kl": 0.0044610954355448484, + "learning_rate": 1.9783333333333336e-06, + "loss": 0.0002, + "num_tokens": 1205517.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 75.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2698228657245636, + "kl": 0.026295214891433716, + "learning_rate": 1.978e-06, + "loss": 0.0013, + "num_tokens": 1205851.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 75.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3328896462917328, + "kl": 0.052076924592256546, + "learning_rate": 1.9776666666666667e-06, + "loss": 0.0031, + "num_tokens": 1206137.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 75.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028835749253630638, + "kl": 0.15647006034851074, + "learning_rate": 1.9773333333333335e-06, + "loss": 0.0078, + "num_tokens": 1206446.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 75.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09617263823747635, + "kl": 0.1250823512673378, + "learning_rate": 1.9770000000000003e-06, + "loss": 0.0063, + "num_tokens": 1206818.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 75.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27157941460609436, + "kl": 0.01233864901587367, + "learning_rate": 1.9766666666666667e-06, + "loss": 0.0006, + "num_tokens": 1207096.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 75.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6838143467903137, + "kl": 0.13872230052947998, + "learning_rate": 1.9763333333333334e-06, + "loss": 0.007, + "num_tokens": 1207429.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 75.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1603475660085678, + "kl": 0.05809360183775425, + "learning_rate": 1.976e-06, + "loss": 0.0029, + "num_tokens": 1207752.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 75.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044732771813869476, + "kl": 0.0029091377509757876, + "learning_rate": 1.9756666666666666e-06, + "loss": 0.0001, + "num_tokens": 1208061.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 75.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10663247853517532, + "kl": 0.029886224307119846, + "learning_rate": 1.9753333333333334e-06, + "loss": 0.0015, + "num_tokens": 1208359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 75.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019416072173044086, + "kl": 0.00015534833073616028, + "learning_rate": 1.975e-06, + "loss": 0.0, + "num_tokens": 1208603.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 75.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041577428579330444, + "kl": 0.003687289310619235, + "learning_rate": 1.974666666666667e-06, + "loss": 0.0002, + "num_tokens": 1208867.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 75.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.235536098480225, + "kl": 0.772625168930972, + "learning_rate": 1.9743333333333333e-06, + "loss": 0.2187, + "num_tokens": 1209150.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 75.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10694680362939835, + "kl": 0.021603311877697706, + "learning_rate": 1.974e-06, + "loss": 0.0012, + "num_tokens": 1209422.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11569098383188248, + "kl": 0.006086892099119723, + "learning_rate": 1.9736666666666664e-06, + "loss": 0.0002, + "num_tokens": 1209676.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 75.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2142038643360138, + "kl": 0.020505985245108604, + "learning_rate": 1.9733333333333336e-06, + "loss": 0.0014, + "num_tokens": 1209906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 75.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027767445892095566, + "kl": 0.0015846788883209229, + "learning_rate": 1.973e-06, + "loss": 0.0001, + "num_tokens": 1210118.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 75.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1347692906856537, + "kl": 0.02943311259150505, + "learning_rate": 1.9726666666666668e-06, + "loss": 0.0016, + "num_tokens": 1210448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 75.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.636884868144989, + "kl": 0.04091236554086208, + "learning_rate": 1.9723333333333336e-06, + "loss": 0.0023, + "num_tokens": 1210715.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 75.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6120728254318237, + "kl": 0.03641462483210489, + "learning_rate": 1.972e-06, + "loss": 0.0048, + "num_tokens": 1210995.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 75.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010601680725812912, + "kl": 0.01476895809173584, + "learning_rate": 1.9716666666666667e-06, + "loss": 0.0007, + "num_tokens": 1211255.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 75.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028710326179862022, + "kl": 0.0011701772746164352, + "learning_rate": 1.9713333333333335e-06, + "loss": 0.0001, + "num_tokens": 1211525.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 75.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30854856967926025, + "kl": 0.022281265817582607, + "learning_rate": 1.9710000000000003e-06, + "loss": 0.0012, + "num_tokens": 1211852.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 75.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08179502189159393, + "kl": 0.010333703365176916, + "learning_rate": 1.9706666666666666e-06, + "loss": 0.0005, + "num_tokens": 1212182.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 75.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03348976746201515, + "kl": 0.0028405068442225456, + "learning_rate": 1.9703333333333334e-06, + "loss": 0.0001, + "num_tokens": 1212494.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 75.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0213506780564785, + "kl": 0.0006847196927992627, + "learning_rate": 1.9699999999999998e-06, + "loss": 0.0, + "num_tokens": 1212729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 75.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04438813403248787, + "kl": 0.0009940594318322837, + "learning_rate": 1.9696666666666666e-06, + "loss": 0.0, + "num_tokens": 1212985.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 75.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039385054260492325, + "kl": 0.027652304619550705, + "learning_rate": 1.9693333333333333e-06, + "loss": 0.0014, + "num_tokens": 1213304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 75.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016336074098944664, + "kl": 0.0006586983799934387, + "learning_rate": 1.969e-06, + "loss": 0.0, + "num_tokens": 1213564.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002488235186319798, + "kl": 9.147077798843384e-05, + "learning_rate": 1.968666666666667e-06, + "loss": 0.0, + "num_tokens": 1213784.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 75.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4082089960575104, + "kl": 0.09059792757034302, + "learning_rate": 1.9683333333333333e-06, + "loss": 0.0033, + "num_tokens": 1214149.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 75.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.916356086730957, + "kl": 0.010026805510278791, + "learning_rate": 1.968e-06, + "loss": 0.1483, + "num_tokens": 1214480.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 75.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6402992010116577, + "kl": 0.08111809473484755, + "learning_rate": 1.9676666666666664e-06, + "loss": -0.0642, + "num_tokens": 1214817.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 4098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 75.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026547765359282494, + "kl": 0.008576929569244385, + "learning_rate": 1.9673333333333336e-06, + "loss": 0.0004, + "num_tokens": 1215139.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 75.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0618815422058105, + "kl": 0.014985653106123209, + "learning_rate": 1.967e-06, + "loss": 0.2613, + "num_tokens": 1215452.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 75.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014280945062637329, + "kl": 0.002806713921017945, + "learning_rate": 1.9666666666666668e-06, + "loss": 0.0001, + "num_tokens": 1215736.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 75.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01078316941857338, + "kl": 0.0038907453417778015, + "learning_rate": 1.9663333333333335e-06, + "loss": 0.0002, + "num_tokens": 1215952.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 75.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024382075294852257, + "kl": 0.04322320222854614, + "learning_rate": 1.966e-06, + "loss": 0.0022, + "num_tokens": 1216356.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.003846153849735856, + "clip_ratio/low_min": 0.003846153849735856, + "clip_ratio/region_mean": 0.003846153849735856, + "completion_length": 49.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 76.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.079254627227783, + "kl": 0.03499941527843475, + "learning_rate": 1.9656666666666667e-06, + "loss": 0.2508, + "num_tokens": 1216776.0, + "reward": 5.625, + "reward_std": 3.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 3.75, + "step": 4104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3840368390083313, + "kl": 0.057057078927755356, + "learning_rate": 1.9653333333333335e-06, + "loss": 0.0025, + "num_tokens": 1217045.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024285269901156425, + "kl": 0.0397414518520236, + "learning_rate": 1.9650000000000002e-06, + "loss": 0.002, + "num_tokens": 1217335.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 76.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010353813879191875, + "kl": 0.00384463369846344, + "learning_rate": 1.9646666666666666e-06, + "loss": 0.0002, + "num_tokens": 1217551.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 76.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4024927616119385, + "kl": 0.016710405237972736, + "learning_rate": 1.9643333333333334e-06, + "loss": 0.0268, + "num_tokens": 1217884.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 76.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05337845906615257, + "kl": 0.15523239225149155, + "learning_rate": 1.9639999999999997e-06, + "loss": 0.0078, + "num_tokens": 1218194.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 76.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1349731832742691, + "kl": 0.008913870726246387, + "learning_rate": 1.9636666666666665e-06, + "loss": 0.0004, + "num_tokens": 1218460.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.7003679275512695, + "kl": 0.014345615170896053, + "learning_rate": 1.9633333333333337e-06, + "loss": 0.2829, + "num_tokens": 1218752.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5468757748603821, + "kl": 0.05816850659903139, + "learning_rate": 1.963e-06, + "loss": 0.0039, + "num_tokens": 1219023.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014043119736015797, + "kl": 0.002794499625451863, + "learning_rate": 1.962666666666667e-06, + "loss": 0.0001, + "num_tokens": 1219307.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 76.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.786628007888794, + "kl": 0.03232190012931824, + "learning_rate": 1.9623333333333332e-06, + "loss": 0.0844, + "num_tokens": 1219629.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 76.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2288224697113037, + "kl": 0.015211954480037093, + "learning_rate": 1.962e-06, + "loss": -0.006, + "num_tokens": 1219961.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 76.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009010828100144863, + "kl": 0.04074282944202423, + "learning_rate": 1.961666666666667e-06, + "loss": 0.002, + "num_tokens": 1220366.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 76.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0738772600889206, + "kl": 0.011800558771938086, + "learning_rate": 1.9613333333333336e-06, + "loss": 0.0006, + "num_tokens": 1220634.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 76.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1359788477420807, + "kl": 0.026944361627101898, + "learning_rate": 1.961e-06, + "loss": 0.0014, + "num_tokens": 1220956.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 76.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09950874745845795, + "kl": 0.002559813321568072, + "learning_rate": 1.9606666666666667e-06, + "loss": 0.0002, + "num_tokens": 1221166.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12391962110996246, + "kl": 0.02209593402221799, + "learning_rate": 1.9603333333333335e-06, + "loss": 0.0013, + "num_tokens": 1221448.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06753460317850113, + "kl": 0.0011758595937862992, + "learning_rate": 1.96e-06, + "loss": 0.0001, + "num_tokens": 1221704.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 76.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08313508331775665, + "kl": 0.004922310006804764, + "learning_rate": 1.9596666666666667e-06, + "loss": 0.0002, + "num_tokens": 1221969.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 76.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002995326940435916, + "kl": 7.545948028564453e-05, + "learning_rate": 1.9593333333333334e-06, + "loss": 0.0, + "num_tokens": 1222189.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 76.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14050045609474182, + "kl": 0.026244450360536575, + "learning_rate": 1.9590000000000002e-06, + "loss": 0.0013, + "num_tokens": 1222515.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 76.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001680325367487967, + "kl": 0.00012325122952461243, + "learning_rate": 1.9586666666666666e-06, + "loss": 0.0, + "num_tokens": 1222759.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 76.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5730146169662476, + "kl": 0.05556255113333464, + "learning_rate": 1.9583333333333334e-06, + "loss": 0.003, + "num_tokens": 1223096.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 76.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061254117637872696, + "kl": 0.002744505414739251, + "learning_rate": 1.9579999999999997e-06, + "loss": 0.0001, + "num_tokens": 1223392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 76.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07077538967132568, + "kl": 0.0038628350594080985, + "learning_rate": 1.957666666666667e-06, + "loss": 0.0002, + "num_tokens": 1223713.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732441321015358, + "kl": 0.008444469727692194, + "learning_rate": 1.9573333333333337e-06, + "loss": 0.0005, + "num_tokens": 1223986.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 76.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05600501224398613, + "kl": 0.0019288398325443268, + "learning_rate": 1.957e-06, + "loss": 0.0001, + "num_tokens": 1224246.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 76.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0069036902859807014, + "kl": 0.0007605880673509091, + "learning_rate": 1.956666666666667e-06, + "loss": 0.0, + "num_tokens": 1224506.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 76.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009192047640681267, + "kl": 0.001749998889863491, + "learning_rate": 1.956333333333333e-06, + "loss": 0.0001, + "num_tokens": 1224818.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 76.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12667137384414673, + "kl": 0.042274574749171734, + "learning_rate": 1.956e-06, + "loss": 0.002, + "num_tokens": 1225119.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 76.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08845563232898712, + "kl": 0.022082612849771976, + "learning_rate": 1.9556666666666668e-06, + "loss": 0.0011, + "num_tokens": 1225461.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 76.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11005394905805588, + "kl": 0.008632947457954288, + "learning_rate": 1.9553333333333336e-06, + "loss": 0.0004, + "num_tokens": 1225790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 76.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08012217283248901, + "kl": 0.11442025378346443, + "learning_rate": 1.955e-06, + "loss": 0.0057, + "num_tokens": 1226162.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 76.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3155250549316406, + "kl": 0.3680860660970211, + "learning_rate": 1.9546666666666667e-06, + "loss": -0.0116, + "num_tokens": 1226499.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 76.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012257328256964684, + "kl": 0.26668278872966766, + "learning_rate": 1.9543333333333335e-06, + "loss": 0.0133, + "num_tokens": 1226803.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 76.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027750497683882713, + "kl": 6.220936666068155e-05, + "learning_rate": 1.954e-06, + "loss": 0.0, + "num_tokens": 1227023.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09977468103170395, + "kl": 0.012493321672081947, + "learning_rate": 1.9536666666666666e-06, + "loss": 0.0006, + "num_tokens": 1227312.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7096132040023804, + "kl": 0.25764910224825144, + "learning_rate": 1.9533333333333334e-06, + "loss": 0.014, + "num_tokens": 1227599.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 76.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3478972911834717, + "kl": 0.0591567512601614, + "learning_rate": 1.953e-06, + "loss": 0.1207, + "num_tokens": 1227944.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 76.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13027948141098022, + "kl": 0.009938912931829691, + "learning_rate": 1.9526666666666665e-06, + "loss": 0.0005, + "num_tokens": 1228215.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 76.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021632181480526924, + "kl": 0.0007337778806686401, + "learning_rate": 1.9523333333333333e-06, + "loss": 0.0, + "num_tokens": 1228427.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 76.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11164669692516327, + "kl": 0.03634909354150295, + "learning_rate": 1.9519999999999997e-06, + "loss": 0.0018, + "num_tokens": 1228797.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 76.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002292829332873225, + "kl": 0.0034130513668060303, + "learning_rate": 1.951666666666667e-06, + "loss": 0.0002, + "num_tokens": 1229033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 76.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01807132549583912, + "kl": 0.0042166029452346265, + "learning_rate": 1.9513333333333337e-06, + "loss": 0.0002, + "num_tokens": 1229321.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 76.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02342912368476391, + "kl": 0.00029052793979644775, + "learning_rate": 1.951e-06, + "loss": 0.0, + "num_tokens": 1229533.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 76.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09676840901374817, + "kl": 0.008191230474039912, + "learning_rate": 1.950666666666667e-06, + "loss": 0.0004, + "num_tokens": 1229839.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 76.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09010336548089981, + "kl": 0.005421877605840564, + "learning_rate": 1.950333333333333e-06, + "loss": 0.0003, + "num_tokens": 1230073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 76.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.130167007446289, + "kl": 0.30865128501318395, + "learning_rate": 1.95e-06, + "loss": 0.2629, + "num_tokens": 1230364.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 4151 + }, + { + "clip_ratio/high_max": 0.013513513840734959, + "clip_ratio/high_mean": 0.013513513840734959, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 76.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0889623165130615, + "kl": 0.06862466409802437, + "learning_rate": 1.9496666666666667e-06, + "loss": 0.0408, + "num_tokens": 1230662.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 76.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.209710121154785, + "kl": 0.05009952932596207, + "learning_rate": 1.9493333333333335e-06, + "loss": 0.2324, + "num_tokens": 1231013.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 76.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06806277483701706, + "kl": 0.06461020186543465, + "learning_rate": 1.949e-06, + "loss": 0.0032, + "num_tokens": 1231399.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 76.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9049837589263916, + "kl": 0.0178663469851017, + "learning_rate": 1.9486666666666667e-06, + "loss": 0.0293, + "num_tokens": 1231706.0, + "reward": 5.125, + "reward_std": 5.421792507171631, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 5.421792507171631, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 76.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011730004101991653, + "kl": 0.014396419283002615, + "learning_rate": 1.9483333333333335e-06, + "loss": 0.0007, + "num_tokens": 1231966.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 76.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06506339460611343, + "kl": 0.006239487323909998, + "learning_rate": 1.948e-06, + "loss": 0.0003, + "num_tokens": 1232238.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 77.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00833575427532196, + "kl": 0.0010321637091692537, + "learning_rate": 1.947666666666667e-06, + "loss": 0.0001, + "num_tokens": 1232547.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 77.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18150325119495392, + "kl": 0.03735784627497196, + "learning_rate": 1.9473333333333334e-06, + "loss": 0.0019, + "num_tokens": 1232847.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 77.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004244080279022455, + "kl": 0.0013645078288391232, + "learning_rate": 1.947e-06, + "loss": 0.0001, + "num_tokens": 1233127.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 77.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2485275268554688, + "kl": 0.03940440155565739, + "learning_rate": 1.9466666666666665e-06, + "loss": -0.0533, + "num_tokens": 1233456.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 77.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10510266572237015, + "kl": 0.00682351685827598, + "learning_rate": 1.9463333333333333e-06, + "loss": 0.0003, + "num_tokens": 1233722.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 77.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03601933270692825, + "kl": 0.002549659227952361, + "learning_rate": 1.946e-06, + "loss": 0.0001, + "num_tokens": 1234024.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 77.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.331364870071411, + "kl": 0.29403063654899597, + "learning_rate": 1.945666666666667e-06, + "loss": 0.0277, + "num_tokens": 1234329.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 77.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4280591011047363, + "kl": 0.7362418845295906, + "learning_rate": 1.9453333333333337e-06, + "loss": 0.0889, + "num_tokens": 1234741.0, + "reward": 1.625, + "reward_std": 1.6007810831069946, + "rewards/reward_combined/mean": 1.625, + "rewards/reward_combined/std": 1.6007810831069946, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 77.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2602388858795166, + "kl": 0.36333779245615005, + "learning_rate": 1.945e-06, + "loss": 0.0423, + "num_tokens": 1235080.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 77.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024190451949834824, + "kl": 0.0007255449891090393, + "learning_rate": 1.944666666666667e-06, + "loss": 0.0, + "num_tokens": 1235323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 77.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306215941905975, + "kl": 0.0032889824360609055, + "learning_rate": 1.944333333333333e-06, + "loss": 0.0002, + "num_tokens": 1235620.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0383589006960392, + "kl": 0.007975178305059671, + "learning_rate": 1.944e-06, + "loss": 0.0004, + "num_tokens": 1235900.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 77.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.775885581970215, + "kl": 0.13293201848864555, + "learning_rate": 1.9436666666666667e-06, + "loss": 0.2955, + "num_tokens": 1236254.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 77.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002294854959473014, + "kl": 0.003396354615688324, + "learning_rate": 1.9433333333333335e-06, + "loss": 0.0002, + "num_tokens": 1236490.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 77.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7070120573043823, + "kl": 0.041661586612463, + "learning_rate": 1.943e-06, + "loss": 0.002, + "num_tokens": 1236759.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 77.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01104077510535717, + "kl": 0.003519028425216675, + "learning_rate": 1.9426666666666666e-06, + "loss": 0.0002, + "num_tokens": 1236975.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 77.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.336381435394287, + "kl": 0.02045949501916766, + "learning_rate": 1.9423333333333334e-06, + "loss": 0.1234, + "num_tokens": 1237276.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 77.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05067688226699829, + "kl": 0.0012031823571305722, + "learning_rate": 1.9419999999999998e-06, + "loss": 0.0001, + "num_tokens": 1237532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 77.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10434551537036896, + "kl": 0.02033104095607996, + "learning_rate": 1.941666666666667e-06, + "loss": 0.001, + "num_tokens": 1237862.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 77.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.882805347442627, + "kl": 0.0539279232325498, + "learning_rate": 1.9413333333333334e-06, + "loss": -0.0388, + "num_tokens": 1238138.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 77.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013030019588768482, + "kl": 0.00021567940711975098, + "learning_rate": 1.941e-06, + "loss": 0.0, + "num_tokens": 1238350.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 77.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.609494686126709, + "kl": 0.023763020522892475, + "learning_rate": 1.9406666666666665e-06, + "loss": 0.0798, + "num_tokens": 1238622.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 77.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.1220316886901855, + "kl": 0.12488710437901318, + "learning_rate": 1.9403333333333333e-06, + "loss": -0.0249, + "num_tokens": 1238881.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 77.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08558971434831619, + "kl": 0.0049365556333214045, + "learning_rate": 1.94e-06, + "loss": 0.0002, + "num_tokens": 1239177.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 77.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015483858063817024, + "kl": 0.003704962902702391, + "learning_rate": 1.939666666666667e-06, + "loss": 0.0002, + "num_tokens": 1239467.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 77.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019819265231490135, + "kl": 0.001352352846879512, + "learning_rate": 1.9393333333333336e-06, + "loss": 0.0001, + "num_tokens": 1239735.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 55.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 55.0, + "completions/mean_terminated_length": 55.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 77.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8206136226654053, + "kl": 0.03712160140275955, + "learning_rate": 1.939e-06, + "loss": 0.2776, + "num_tokens": 1240179.0, + "reward": 6.25, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 2.5, + "step": 4184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 77.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04253672435879707, + "kl": 0.011192928068339825, + "learning_rate": 1.9386666666666668e-06, + "loss": 0.0006, + "num_tokens": 1240513.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 77.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003173008153680712, + "kl": 6.728619337081909e-05, + "learning_rate": 1.938333333333333e-06, + "loss": 0.0, + "num_tokens": 1240733.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02005135826766491, + "kl": 0.002891370910219848, + "learning_rate": 1.938e-06, + "loss": 0.0001, + "num_tokens": 1241017.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 77.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3200020790100098, + "kl": 0.08114099875092506, + "learning_rate": 1.9376666666666667e-06, + "loss": 0.133, + "num_tokens": 1241399.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 4188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 77.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.443061351776123, + "kl": 0.05532825365662575, + "learning_rate": 1.9373333333333335e-06, + "loss": 0.0109, + "num_tokens": 1241705.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 77.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029117180034518242, + "kl": 0.09699800238013268, + "learning_rate": 1.9370000000000003e-06, + "loss": 0.0048, + "num_tokens": 1242077.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 77.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011976920068264008, + "kl": 6.387233770510647e-05, + "learning_rate": 1.9366666666666666e-06, + "loss": 0.0, + "num_tokens": 1242297.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 77.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.93695330619812, + "kl": 0.11477098986506462, + "learning_rate": 1.9363333333333334e-06, + "loss": -0.0766, + "num_tokens": 1242646.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 4192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 77.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39902207255363464, + "kl": 0.07843837421387434, + "learning_rate": 1.936e-06, + "loss": 0.0037, + "num_tokens": 1242942.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 77.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2322455644607544, + "kl": 0.04774339310824871, + "learning_rate": 1.935666666666667e-06, + "loss": 0.0025, + "num_tokens": 1243302.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 77.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08009503036737442, + "kl": 0.026713049970567226, + "learning_rate": 1.9353333333333333e-06, + "loss": 0.0013, + "num_tokens": 1243604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 77.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02505462057888508, + "kl": 0.0008030809694901109, + "learning_rate": 1.935e-06, + "loss": 0.0, + "num_tokens": 1243866.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 77.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004699228331446648, + "kl": 0.0010910580749623477, + "learning_rate": 1.9346666666666665e-06, + "loss": 0.0001, + "num_tokens": 1244178.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041575249284505844, + "kl": 0.011271217401372269, + "learning_rate": 1.9343333333333333e-06, + "loss": 0.0006, + "num_tokens": 1244464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008147178217768669, + "kl": 0.00954483076930046, + "learning_rate": 1.934e-06, + "loss": 0.0005, + "num_tokens": 1244736.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 77.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.629997730255127, + "kl": 0.8537364536896348, + "learning_rate": 1.933666666666667e-06, + "loss": 0.0701, + "num_tokens": 1244997.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 77.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014347920194268227, + "kl": 0.0005748532712459564, + "learning_rate": 1.9333333333333336e-06, + "loss": 0.0, + "num_tokens": 1245257.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 77.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10747025907039642, + "kl": 0.007933998480439186, + "learning_rate": 1.933e-06, + "loss": 0.0004, + "num_tokens": 1245515.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 77.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013548709452152252, + "kl": 0.002063589170575142, + "learning_rate": 1.9326666666666667e-06, + "loss": 0.0001, + "num_tokens": 1245827.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 77.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16767403483390808, + "kl": 0.030538485618308187, + "learning_rate": 1.932333333333333e-06, + "loss": 0.0014, + "num_tokens": 1246147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03566483408212662, + "kl": 0.0016097147017717361, + "learning_rate": 1.9320000000000003e-06, + "loss": 0.0001, + "num_tokens": 1246416.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 77.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009168892866000533, + "kl": 2.0645558834075928e-05, + "learning_rate": 1.9316666666666667e-06, + "loss": 0.0, + "num_tokens": 1246628.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 77.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14718596637248993, + "kl": 0.04609527066349983, + "learning_rate": 1.9313333333333334e-06, + "loss": 0.0024, + "num_tokens": 1246982.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 77.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.308185338973999, + "kl": 0.10179235972464085, + "learning_rate": 1.9310000000000002e-06, + "loss": 0.0051, + "num_tokens": 1247360.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 77.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.932782173156738, + "kl": 0.09911003150045872, + "learning_rate": 1.9306666666666666e-06, + "loss": 0.0901, + "num_tokens": 1247644.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 77.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011597269214689732, + "kl": 0.00032469630241394043, + "learning_rate": 1.9303333333333334e-06, + "loss": 0.0, + "num_tokens": 1247854.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 77.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049700118601322174, + "kl": 0.0016231692279689014, + "learning_rate": 1.93e-06, + "loss": 0.0001, + "num_tokens": 1248088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 78.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03323694318532944, + "kl": 0.0014779643970541656, + "learning_rate": 1.929666666666667e-06, + "loss": 0.0001, + "num_tokens": 1248413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 78.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14461404085159302, + "kl": 0.032939229160547256, + "learning_rate": 1.9293333333333333e-06, + "loss": 0.0017, + "num_tokens": 1248709.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 78.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026254257187247276, + "kl": 0.03676637168973684, + "learning_rate": 1.929e-06, + "loss": 0.0019, + "num_tokens": 1249001.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11635847389698029, + "kl": 0.004655453558370937, + "learning_rate": 1.9286666666666664e-06, + "loss": 0.0002, + "num_tokens": 1249257.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 78.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054019249975681305, + "kl": 0.0010344207403250039, + "learning_rate": 1.9283333333333332e-06, + "loss": 0.0, + "num_tokens": 1249470.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 78.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0921623483300209, + "kl": 0.01755582168698311, + "learning_rate": 1.928e-06, + "loss": 0.001, + "num_tokens": 1249780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 78.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3513851165771484, + "kl": 0.0199673967435956, + "learning_rate": 1.927666666666667e-06, + "loss": 0.0582, + "num_tokens": 1250122.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 78.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13223950564861298, + "kl": 0.007777614053338766, + "learning_rate": 1.9273333333333336e-06, + "loss": 0.0004, + "num_tokens": 1250439.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 78.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09461899846792221, + "kl": 0.008546177297830582, + "learning_rate": 1.927e-06, + "loss": 0.0004, + "num_tokens": 1250682.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 78.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6419639587402344, + "kl": 0.047966865822672844, + "learning_rate": 1.9266666666666667e-06, + "loss": -0.1356, + "num_tokens": 1251046.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 4221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 78.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07959099113941193, + "kl": 0.010591855272650719, + "learning_rate": 1.926333333333333e-06, + "loss": 0.0005, + "num_tokens": 1251314.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 78.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.652174711227417, + "kl": 0.10362341441214085, + "learning_rate": 1.9260000000000003e-06, + "loss": 0.0061, + "num_tokens": 1251600.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 78.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.025179386138916, + "kl": 0.1205407865345478, + "learning_rate": 1.9256666666666666e-06, + "loss": 0.0572, + "num_tokens": 1251946.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 4224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 78.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014520753175020218, + "kl": 0.0024043945595622063, + "learning_rate": 1.9253333333333334e-06, + "loss": 0.0001, + "num_tokens": 1252242.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 78.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041846100240945816, + "kl": 0.0011122730211354792, + "learning_rate": 1.925e-06, + "loss": 0.0001, + "num_tokens": 1252518.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 78.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003778262296691537, + "kl": 8.817017078399658e-05, + "learning_rate": 1.9246666666666666e-06, + "loss": 0.0, + "num_tokens": 1252730.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08723165839910507, + "kl": 0.023326152935624123, + "learning_rate": 1.9243333333333333e-06, + "loss": 0.001, + "num_tokens": 1253053.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 78.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033983197063207626, + "kl": 0.009079038631170988, + "learning_rate": 1.924e-06, + "loss": 0.0004, + "num_tokens": 1253342.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 78.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4294841289520264, + "kl": 0.05626204237341881, + "learning_rate": 1.923666666666667e-06, + "loss": -0.092, + "num_tokens": 1253709.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 78.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.3585896492004395, + "kl": 0.017691759392619133, + "learning_rate": 1.9233333333333333e-06, + "loss": 0.0667, + "num_tokens": 1253992.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 78.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4049912095069885, + "kl": 0.06708768382668495, + "learning_rate": 1.923e-06, + "loss": 0.0034, + "num_tokens": 1254375.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 78.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013180306181311607, + "kl": 0.0010220229742117226, + "learning_rate": 1.9226666666666664e-06, + "loss": 0.0001, + "num_tokens": 1254695.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4233 + }, + { + "clip_ratio/high_max": 0.007352941203862429, + "clip_ratio/high_mean": 0.007352941203862429, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007352941203862429, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 78.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9212491512298584, + "kl": 0.10197229124605656, + "learning_rate": 1.922333333333333e-06, + "loss": 0.0834, + "num_tokens": 1255043.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 78.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006084556225687265, + "kl": 0.0003930516541004181, + "learning_rate": 1.9220000000000004e-06, + "loss": 0.0, + "num_tokens": 1255303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 78.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05273579806089401, + "kl": 0.016235220013186336, + "learning_rate": 1.9216666666666668e-06, + "loss": 0.0009, + "num_tokens": 1255619.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 78.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12951022386550903, + "kl": 0.025173373520374298, + "learning_rate": 1.9213333333333335e-06, + "loss": 0.0013, + "num_tokens": 1255919.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 78.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005158380605280399, + "kl": 0.2680702954530716, + "learning_rate": 1.921e-06, + "loss": 0.0134, + "num_tokens": 1256223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 78.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31500211358070374, + "kl": 0.037970778765156865, + "learning_rate": 1.9206666666666667e-06, + "loss": 0.0018, + "num_tokens": 1256505.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29990875720977783, + "kl": 0.03192344726994634, + "learning_rate": 1.9203333333333335e-06, + "loss": 0.0016, + "num_tokens": 1256776.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 78.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.599093437194824, + "kl": 0.03590135369449854, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.0341, + "num_tokens": 1257115.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 78.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008764144033193588, + "kl": 0.009205047506839037, + "learning_rate": 1.9196666666666666e-06, + "loss": 0.0005, + "num_tokens": 1257387.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 78.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03793429583311081, + "kl": 0.0028215666534379125, + "learning_rate": 1.9193333333333334e-06, + "loss": 0.0001, + "num_tokens": 1257689.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 78.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010651134885847569, + "kl": 0.003851078450679779, + "learning_rate": 1.919e-06, + "loss": 0.0002, + "num_tokens": 1257905.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 78.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003185986424796283, + "kl": 6.905943155288696e-05, + "learning_rate": 1.9186666666666665e-06, + "loss": 0.0, + "num_tokens": 1258125.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 78.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03568355366587639, + "kl": 0.0044396971934475005, + "learning_rate": 1.9183333333333333e-06, + "loss": 0.0002, + "num_tokens": 1258395.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 78.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.023728370666504, + "kl": 0.024265441112220287, + "learning_rate": 1.918e-06, + "loss": 0.1552, + "num_tokens": 1258663.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 78.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03483027592301369, + "kl": 0.005986180156469345, + "learning_rate": 1.917666666666667e-06, + "loss": 0.0003, + "num_tokens": 1258954.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0063624382019043, + "kl": 0.05899708718061447, + "learning_rate": 1.9173333333333332e-06, + "loss": 0.0032, + "num_tokens": 1259220.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.068668365478516, + "kl": 0.0071154829929582775, + "learning_rate": 1.917e-06, + "loss": 0.198, + "num_tokens": 1259493.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 78.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10169423371553421, + "kl": 0.005140399909578264, + "learning_rate": 1.9166666666666664e-06, + "loss": 0.0003, + "num_tokens": 1259710.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 78.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.462865352630615, + "kl": 0.060255058109760284, + "learning_rate": 1.9163333333333336e-06, + "loss": -0.0394, + "num_tokens": 1260072.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 78.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07884665578603745, + "kl": 0.001622125506401062, + "learning_rate": 1.9160000000000004e-06, + "loss": 0.0001, + "num_tokens": 1260276.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 78.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038811977952718735, + "kl": 0.002060721308225766, + "learning_rate": 1.9156666666666667e-06, + "loss": 0.0001, + "num_tokens": 1260546.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 78.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9585207104682922, + "kl": 0.14760783314704895, + "learning_rate": 1.9153333333333335e-06, + "loss": 0.0345, + "num_tokens": 1260954.0, + "reward": 1.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 4255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 78.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.040985584259033, + "kl": 0.0993734747171402, + "learning_rate": 1.915e-06, + "loss": 0.0031, + "num_tokens": 1261326.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 78.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001683708280324936, + "kl": 0.0035288333892822266, + "learning_rate": 1.9146666666666667e-06, + "loss": 0.0002, + "num_tokens": 1261562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 78.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24805141985416412, + "kl": 0.07200213894248009, + "learning_rate": 1.9143333333333334e-06, + "loss": 0.0024, + "num_tokens": 1261921.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 78.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00754777854308486, + "kl": 0.0004747495841002092, + "learning_rate": 1.9140000000000002e-06, + "loss": 0.0, + "num_tokens": 1262156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 78.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13387122750282288, + "kl": 0.00983450561761856, + "learning_rate": 1.9136666666666666e-06, + "loss": 0.0005, + "num_tokens": 1262414.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 78.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04390999674797058, + "kl": 0.004116417956538498, + "learning_rate": 1.9133333333333334e-06, + "loss": 0.0002, + "num_tokens": 1262696.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 78.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18405263125896454, + "kl": 0.04805176518857479, + "learning_rate": 1.913e-06, + "loss": 0.0024, + "num_tokens": 1263022.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 78.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.680548906326294, + "kl": 0.1030915305018425, + "learning_rate": 1.9126666666666665e-06, + "loss": 0.0631, + "num_tokens": 1263343.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 4263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 78.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013858833350241184, + "kl": 0.16277816146612167, + "learning_rate": 1.9123333333333333e-06, + "loss": 0.0081, + "num_tokens": 1263651.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 78.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.160861968994141, + "kl": 0.12520997156389058, + "learning_rate": 1.912e-06, + "loss": 0.0064, + "num_tokens": 1263927.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 4265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 79.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09493882209062576, + "kl": 0.019914139062166214, + "learning_rate": 1.911666666666667e-06, + "loss": 0.001, + "num_tokens": 1264246.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 79.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005010351072996855, + "kl": 0.0014900097157806158, + "learning_rate": 1.9113333333333332e-06, + "loss": 0.0001, + "num_tokens": 1264526.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.642378807067871, + "kl": 0.14753237552940845, + "learning_rate": 1.911e-06, + "loss": -0.0592, + "num_tokens": 1264804.0, + "reward": 6.625, + "reward_std": 2.0966243743896484, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.0966243743896484, + "step": 4268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 79.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2820652723312378, + "kl": 0.32148877531290054, + "learning_rate": 1.9106666666666664e-06, + "loss": 0.0041, + "num_tokens": 1265173.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 79.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5127744674682617, + "kl": 0.08697371184825897, + "learning_rate": 1.9103333333333336e-06, + "loss": 0.0199, + "num_tokens": 1265526.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 79.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06507395207881927, + "kl": 0.007719706802163273, + "learning_rate": 1.9100000000000003e-06, + "loss": 0.0004, + "num_tokens": 1265800.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 79.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00924857147037983, + "kl": 0.0010975684854201972, + "learning_rate": 1.9096666666666667e-06, + "loss": 0.0001, + "num_tokens": 1266119.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 79.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025987261906266212, + "kl": 0.00031992196454666555, + "learning_rate": 1.9093333333333335e-06, + "loss": 0.0, + "num_tokens": 1266332.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 79.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025542592629790306, + "kl": 0.006770234787836671, + "learning_rate": 1.909e-06, + "loss": 0.0003, + "num_tokens": 1266621.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 79.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15291957557201385, + "kl": 0.008299468085169792, + "learning_rate": 1.9086666666666666e-06, + "loss": 0.0004, + "num_tokens": 1266866.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 79.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10867872834205627, + "kl": 0.016073176288045943, + "learning_rate": 1.9083333333333334e-06, + "loss": 0.0008, + "num_tokens": 1267195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 76.75, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 79.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4292832612991333, + "kl": 0.08965224772691727, + "learning_rate": 1.908e-06, + "loss": 0.0318, + "num_tokens": 1267742.0, + "reward": 2.299999952316284, + "reward_std": 6.58989143371582, + "rewards/reward_combined/mean": 2.299999952316284, + "rewards/reward_combined/std": 6.58989143371582, + "step": 4277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 79.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03266540542244911, + "kl": 0.001321264458965743, + "learning_rate": 1.9076666666666666e-06, + "loss": 0.0001, + "num_tokens": 1268014.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 79.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030367404222488403, + "kl": 0.0009278854122385383, + "learning_rate": 1.9073333333333333e-06, + "loss": 0.0001, + "num_tokens": 1268230.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 79.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.661116123199463, + "kl": 0.08012433722615242, + "learning_rate": 1.9070000000000001e-06, + "loss": 0.0896, + "num_tokens": 1268572.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 79.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.027522087097168, + "kl": 0.030927312094718218, + "learning_rate": 1.9066666666666667e-06, + "loss": 0.0657, + "num_tokens": 1268922.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 4281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 79.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07460688799619675, + "kl": 0.012812409084290266, + "learning_rate": 1.9063333333333335e-06, + "loss": 0.0006, + "num_tokens": 1269251.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1361246407032013, + "kl": 0.015147800091654062, + "learning_rate": 1.906e-06, + "loss": 0.0007, + "num_tokens": 1269520.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 79.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38312283158302307, + "kl": 0.0375064592808485, + "learning_rate": 1.9056666666666668e-06, + "loss": 0.0019, + "num_tokens": 1269780.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 79.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06417759507894516, + "kl": 0.013725708704441786, + "learning_rate": 1.9053333333333332e-06, + "loss": 0.0007, + "num_tokens": 1270074.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 79.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0872919112443924, + "kl": 0.037317905575037, + "learning_rate": 1.905e-06, + "loss": 0.0019, + "num_tokens": 1270438.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 79.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06286707520484924, + "kl": 0.003374706720933318, + "learning_rate": 1.9046666666666665e-06, + "loss": 0.0002, + "num_tokens": 1270734.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 79.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003144270449411124, + "kl": 7.129460573196411e-05, + "learning_rate": 1.9043333333333333e-06, + "loss": 0.0, + "num_tokens": 1270954.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 79.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031572021543979645, + "kl": 0.0025535791646689177, + "learning_rate": 1.9040000000000001e-06, + "loss": 0.0001, + "num_tokens": 1271208.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 79.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.994157791137695, + "kl": 0.0037134106969460845, + "learning_rate": 1.9036666666666667e-06, + "loss": 0.1702, + "num_tokens": 1271485.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 79.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001585761085152626, + "kl": 0.0035687386989593506, + "learning_rate": 1.9033333333333335e-06, + "loss": 0.0002, + "num_tokens": 1271721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 79.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07599655538797379, + "kl": 0.0033965239708777517, + "learning_rate": 1.903e-06, + "loss": 0.0002, + "num_tokens": 1271942.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4292 + }, + { + "clip_ratio/high_max": 0.0055555556900799274, + "clip_ratio/high_mean": 0.0055555556900799274, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0055555556900799274, + "completion_length": 45.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 45.25, + "completions/mean_terminated_length": 45.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 79.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.595132350921631, + "kl": 0.26547348499298096, + "learning_rate": 1.9026666666666668e-06, + "loss": -0.0063, + "num_tokens": 1272351.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 4293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 79.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028872394934296608, + "kl": 0.0007407168741337955, + "learning_rate": 1.9023333333333332e-06, + "loss": 0.0, + "num_tokens": 1272563.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 79.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08227001875638962, + "kl": 0.015970894135534763, + "learning_rate": 1.9020000000000002e-06, + "loss": 0.0008, + "num_tokens": 1272865.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 79.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0045527382753789425, + "kl": 0.26818473637104034, + "learning_rate": 1.9016666666666665e-06, + "loss": 0.0134, + "num_tokens": 1273169.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 79.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004742157179862261, + "kl": 0.01580418087542057, + "learning_rate": 1.9013333333333333e-06, + "loss": 0.0008, + "num_tokens": 1273429.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04324128106236458, + "kl": 0.008228184015024453, + "learning_rate": 1.901e-06, + "loss": 0.0004, + "num_tokens": 1273715.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 79.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3525957763195038, + "kl": 0.03991496190428734, + "learning_rate": 1.9006666666666667e-06, + "loss": 0.0023, + "num_tokens": 1273995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 79.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027048857882618904, + "kl": 0.001616847701370716, + "learning_rate": 1.9003333333333334e-06, + "loss": 0.0001, + "num_tokens": 1274307.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18871232867240906, + "kl": 0.02549402043223381, + "learning_rate": 1.9e-06, + "loss": 0.0014, + "num_tokens": 1274591.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10089059174060822, + "kl": 0.011142004746943712, + "learning_rate": 1.8996666666666668e-06, + "loss": 0.0005, + "num_tokens": 1274884.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 79.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10412485152482986, + "kl": 0.018473886884748936, + "learning_rate": 1.8993333333333332e-06, + "loss": 0.0009, + "num_tokens": 1275172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 79.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021944435313344002, + "kl": 0.002671961672604084, + "learning_rate": 1.8990000000000002e-06, + "loss": 0.0001, + "num_tokens": 1275456.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4304 + }, + { + "clip_ratio/high_max": 0.017126270104199648, + "clip_ratio/high_mean": 0.017126270104199648, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017126270104199648, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 79.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4490644931793213, + "kl": 0.0364563032053411, + "learning_rate": 1.8986666666666665e-06, + "loss": 0.0091, + "num_tokens": 1275826.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 79.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09895407408475876, + "kl": 0.05819042772054672, + "learning_rate": 1.8983333333333333e-06, + "loss": 0.003, + "num_tokens": 1276238.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 79.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053417712450027466, + "kl": 0.0011590928334044293, + "learning_rate": 1.898e-06, + "loss": 0.0001, + "num_tokens": 1276494.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 79.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.106990814208984, + "kl": 0.04312201403081417, + "learning_rate": 1.8976666666666667e-06, + "loss": 0.0704, + "num_tokens": 1276838.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 79.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005180777050554752, + "kl": 0.0009722212562337518, + "learning_rate": 1.8973333333333334e-06, + "loss": 0.0, + "num_tokens": 1277150.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 79.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06083007901906967, + "kl": 0.03393148444592953, + "learning_rate": 1.897e-06, + "loss": 0.0017, + "num_tokens": 1277452.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 79.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23645886778831482, + "kl": 0.022639069007709622, + "learning_rate": 1.8966666666666668e-06, + "loss": 0.0012, + "num_tokens": 1277714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 79.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11366508156061172, + "kl": 0.027273572981357574, + "learning_rate": 1.8963333333333331e-06, + "loss": 0.0014, + "num_tokens": 1278020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 79.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041096366941928864, + "kl": 0.0013561142259277403, + "learning_rate": 1.8960000000000001e-06, + "loss": 0.0001, + "num_tokens": 1278253.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 79.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009637952782213688, + "kl": 0.0044071972370147705, + "learning_rate": 1.895666666666667e-06, + "loss": 0.0002, + "num_tokens": 1278469.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 79.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.735550880432129, + "kl": 0.04328707233071327, + "learning_rate": 1.8953333333333333e-06, + "loss": 0.0801, + "num_tokens": 1278746.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 79.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17550866305828094, + "kl": 0.027146801352500916, + "learning_rate": 1.8950000000000003e-06, + "loss": 0.0012, + "num_tokens": 1279068.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 79.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08663085103034973, + "kl": 0.023260490968823433, + "learning_rate": 1.8946666666666666e-06, + "loss": 0.0012, + "num_tokens": 1279397.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 79.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08788569271564484, + "kl": 0.012014943495159969, + "learning_rate": 1.8943333333333334e-06, + "loss": 0.0006, + "num_tokens": 1279669.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 79.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04950766637921333, + "kl": 0.0012860782444477081, + "learning_rate": 1.894e-06, + "loss": 0.0001, + "num_tokens": 1279929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2858528196811676, + "kl": 0.0411251001060009, + "learning_rate": 1.8936666666666668e-06, + "loss": 0.0022, + "num_tokens": 1280225.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 80.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9411813020706177, + "kl": 0.062322698533535004, + "learning_rate": 1.8933333333333333e-06, + "loss": 0.0176, + "num_tokens": 1280631.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 4321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 80.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003150520788040012, + "kl": 7.299333810806274e-05, + "learning_rate": 1.8930000000000001e-06, + "loss": 0.0, + "num_tokens": 1280851.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 80.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15832574665546417, + "kl": 0.015317570883780718, + "learning_rate": 1.892666666666667e-06, + "loss": 0.0008, + "num_tokens": 1281121.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 80.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05653592571616173, + "kl": 0.004452500492334366, + "learning_rate": 1.8923333333333333e-06, + "loss": 0.0002, + "num_tokens": 1281417.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 80.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1150771901011467, + "kl": 0.03150841686874628, + "learning_rate": 1.8920000000000003e-06, + "loss": 0.0015, + "num_tokens": 1281756.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 80.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1055835708975792, + "kl": 0.007861072663217783, + "learning_rate": 1.8916666666666666e-06, + "loss": 0.0004, + "num_tokens": 1282054.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 80.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039390482008457184, + "kl": 0.0019255817751400173, + "learning_rate": 1.8913333333333334e-06, + "loss": 0.0001, + "num_tokens": 1282324.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 80.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018356727436184883, + "kl": 0.004501785384491086, + "learning_rate": 1.891e-06, + "loss": 0.0002, + "num_tokens": 1282615.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 80.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006139532197266817, + "kl": 0.0001577496514073573, + "learning_rate": 1.8906666666666668e-06, + "loss": 0.0, + "num_tokens": 1282871.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015639450401067734, + "kl": 0.002255136496387422, + "learning_rate": 1.8903333333333333e-06, + "loss": 0.0001, + "num_tokens": 1283173.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 80.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014675280544906855, + "kl": 0.0035926103591918945, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.0002, + "num_tokens": 1283409.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 80.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.857666969299316, + "kl": 0.0602201409637928, + "learning_rate": 1.889666666666667e-06, + "loss": 0.3233, + "num_tokens": 1283628.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 80.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1367505043745041, + "kl": 0.006942069390788674, + "learning_rate": 1.8893333333333333e-06, + "loss": 0.0004, + "num_tokens": 1283876.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 80.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0051569328643381596, + "kl": 0.0009948793449439108, + "learning_rate": 1.8890000000000003e-06, + "loss": 0.0, + "num_tokens": 1284188.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 80.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04585277661681175, + "kl": 0.0010584443807601929, + "learning_rate": 1.8886666666666666e-06, + "loss": 0.0, + "num_tokens": 1284398.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 80.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05766990780830383, + "kl": 0.01633353903889656, + "learning_rate": 1.8883333333333334e-06, + "loss": 0.0008, + "num_tokens": 1284720.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 80.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03653598576784134, + "kl": 0.16410651803016663, + "learning_rate": 1.888e-06, + "loss": 0.0082, + "num_tokens": 1285029.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 80.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5428266525268555, + "kl": 0.3339054733514786, + "learning_rate": 1.8876666666666667e-06, + "loss": 0.0343, + "num_tokens": 1285334.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 80.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009092086926102638, + "kl": 0.004559628665447235, + "learning_rate": 1.8873333333333333e-06, + "loss": 0.0002, + "num_tokens": 1285550.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 80.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17435891926288605, + "kl": 0.006879238877445459, + "learning_rate": 1.887e-06, + "loss": 0.0004, + "num_tokens": 1285763.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 80.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17169271409511566, + "kl": 0.05368756130337715, + "learning_rate": 1.8866666666666669e-06, + "loss": 0.0027, + "num_tokens": 1286118.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 80.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010143171064555645, + "kl": 0.0006027743220329285, + "learning_rate": 1.8863333333333335e-06, + "loss": 0.0, + "num_tokens": 1286378.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 80.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.433819532394409, + "kl": 0.024248501285910606, + "learning_rate": 1.8860000000000002e-06, + "loss": 0.0367, + "num_tokens": 1286712.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 80.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003716086270287633, + "kl": 0.000155717134475708, + "learning_rate": 1.8856666666666666e-06, + "loss": 0.0, + "num_tokens": 1286956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 80.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04067979007959366, + "kl": 0.00615118513815105, + "learning_rate": 1.8853333333333334e-06, + "loss": 0.0003, + "num_tokens": 1287233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 80.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004789065103977919, + "kl": 0.015812963247299194, + "learning_rate": 1.885e-06, + "loss": 0.0008, + "num_tokens": 1287493.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 80.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00406251847743988, + "kl": 0.0017495816573500633, + "learning_rate": 1.8846666666666667e-06, + "loss": 0.0001, + "num_tokens": 1287805.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 80.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103120744228363, + "kl": 0.013902743812650442, + "learning_rate": 1.8843333333333333e-06, + "loss": 0.0009, + "num_tokens": 1288081.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 80.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2596316337585449, + "kl": 0.05641137808561325, + "learning_rate": 1.884e-06, + "loss": 0.0028, + "num_tokens": 1288398.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 80.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6061272621154785, + "kl": 0.06760421209037304, + "learning_rate": 1.8836666666666669e-06, + "loss": 0.0217, + "num_tokens": 1288672.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 46.0, + "completions/mean_terminated_length": 46.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 80.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.706523895263672, + "kl": 0.04157466068863869, + "learning_rate": 1.8833333333333334e-06, + "loss": 0.1925, + "num_tokens": 1289076.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4351 + }, + { + "clip_ratio/high_max": 0.008064515888690948, + "clip_ratio/high_mean": 0.008064515888690948, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008064515888690948, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 80.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.284354329109192, + "kl": 0.08879881352186203, + "learning_rate": 1.8830000000000002e-06, + "loss": 0.0829, + "num_tokens": 1289430.0, + "reward": 2.875, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 4352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07128682732582092, + "kl": 0.030314982868731022, + "learning_rate": 1.8826666666666666e-06, + "loss": 0.0015, + "num_tokens": 1289759.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 80.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.432647705078125, + "kl": 0.2135235331952572, + "learning_rate": 1.8823333333333334e-06, + "loss": -0.1782, + "num_tokens": 1290069.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 80.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.060464859008789, + "kl": 0.1365949995815754, + "learning_rate": 1.882e-06, + "loss": 0.1264, + "num_tokens": 1290447.0, + "reward": 3.375, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 4355 + }, + { + "clip_ratio/high_max": 0.015625, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9406650066375732, + "kl": 0.013447611592710018, + "learning_rate": 1.8816666666666667e-06, + "loss": -0.0992, + "num_tokens": 1290725.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 4356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 80.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10507752746343613, + "kl": 0.01101256930269301, + "learning_rate": 1.8813333333333333e-06, + "loss": 0.0006, + "num_tokens": 1290989.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 80.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010743080638349056, + "kl": 0.0011632859241217375, + "learning_rate": 1.881e-06, + "loss": 0.0001, + "num_tokens": 1291309.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 80.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22154352068901062, + "kl": 0.012188445311039686, + "learning_rate": 1.8806666666666669e-06, + "loss": 0.0006, + "num_tokens": 1291581.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 80.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6668715476989746, + "kl": 0.024648002348840237, + "learning_rate": 1.8803333333333334e-06, + "loss": 0.1614, + "num_tokens": 1291934.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 80.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005185931921005249, + "kl": 0.00021985769853927195, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.0, + "num_tokens": 1292154.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 80.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.872429847717285, + "kl": 0.03848284587729722, + "learning_rate": 1.8796666666666666e-06, + "loss": 0.0051, + "num_tokens": 1292435.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 4362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 80.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08073687553405762, + "kl": 0.010062988847494125, + "learning_rate": 1.8793333333333334e-06, + "loss": 0.0005, + "num_tokens": 1292693.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 80.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2071800231933594, + "kl": 0.07048040628433228, + "learning_rate": 1.879e-06, + "loss": -0.0647, + "num_tokens": 1293067.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 80.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3753097653388977, + "kl": 0.042253438383340836, + "learning_rate": 1.8786666666666667e-06, + "loss": 0.002, + "num_tokens": 1293363.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 80.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12837550044059753, + "kl": 0.016710346564650536, + "learning_rate": 1.8783333333333333e-06, + "loss": 0.0008, + "num_tokens": 1293690.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 80.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005399358458817005, + "kl": 0.0003994263242930174, + "learning_rate": 1.878e-06, + "loss": 0.0, + "num_tokens": 1293925.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034773413091897964, + "kl": 0.0151158763183048, + "learning_rate": 1.8776666666666668e-06, + "loss": 0.0008, + "num_tokens": 1294212.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 80.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06649592518806458, + "kl": 0.030844278633594513, + "learning_rate": 1.8773333333333334e-06, + "loss": 0.0015, + "num_tokens": 1294496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 80.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0175692830234766, + "kl": 0.0052419493440538645, + "learning_rate": 1.8770000000000002e-06, + "loss": 0.0003, + "num_tokens": 1294764.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 80.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08722560107707977, + "kl": 0.08403240889310837, + "learning_rate": 1.8766666666666666e-06, + "loss": 0.0043, + "num_tokens": 1295132.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 80.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09624875336885452, + "kl": 0.013174053281545639, + "learning_rate": 1.8763333333333336e-06, + "loss": 0.0007, + "num_tokens": 1295420.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 80.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.580333232879639, + "kl": 0.1536710560321808, + "learning_rate": 1.876e-06, + "loss": 0.1715, + "num_tokens": 1295755.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 4373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01680237427353859, + "kl": 0.0023757058661431074, + "learning_rate": 1.8756666666666667e-06, + "loss": 0.0001, + "num_tokens": 1296039.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03817089647054672, + "kl": 0.0018637944594956934, + "learning_rate": 1.8753333333333333e-06, + "loss": 0.0001, + "num_tokens": 1296309.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 81.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02150399424135685, + "kl": 0.0009942147298716009, + "learning_rate": 1.875e-06, + "loss": 0.0, + "num_tokens": 1296543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.481693267822266, + "kl": 0.01328006386756897, + "learning_rate": 1.8746666666666668e-06, + "loss": -0.027, + "num_tokens": 1296813.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.564919948577881, + "kl": 0.015312770381569862, + "learning_rate": 1.8743333333333334e-06, + "loss": 0.1962, + "num_tokens": 1297126.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 81.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.664789199829102, + "kl": 0.03134606145613361, + "learning_rate": 1.8740000000000002e-06, + "loss": 0.2246, + "num_tokens": 1297354.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 81.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006957850884646177, + "kl": 0.0016645299619995058, + "learning_rate": 1.8736666666666665e-06, + "loss": 0.0001, + "num_tokens": 1297634.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 81.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008181389421224594, + "kl": 0.0004776865243911743, + "learning_rate": 1.8733333333333335e-06, + "loss": 0.0, + "num_tokens": 1297842.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 81.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10321664065122604, + "kl": 0.011219854932278395, + "learning_rate": 1.873e-06, + "loss": 0.0005, + "num_tokens": 1298107.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 81.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4204522967338562, + "kl": 0.10652109235525131, + "learning_rate": 1.8726666666666667e-06, + "loss": 0.0054, + "num_tokens": 1298445.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 81.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11199964582920074, + "kl": 0.02174329198896885, + "learning_rate": 1.8723333333333333e-06, + "loss": 0.0011, + "num_tokens": 1298767.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 81.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0554036907851696, + "kl": 0.00952942413277924, + "learning_rate": 1.872e-06, + "loss": 0.0005, + "num_tokens": 1299056.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 81.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3276974856853485, + "kl": 0.04634904861450195, + "learning_rate": 1.8716666666666668e-06, + "loss": 0.0025, + "num_tokens": 1299342.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006416888441890478, + "kl": 0.010424769949167967, + "learning_rate": 1.8713333333333334e-06, + "loss": 0.0005, + "num_tokens": 1299614.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 81.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011799843050539494, + "kl": 0.0036661922931671143, + "learning_rate": 1.8710000000000002e-06, + "loss": 0.0002, + "num_tokens": 1299850.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004587155766785145, + "clip_ratio/low_min": 0.004587155766785145, + "clip_ratio/region_mean": 0.004587155766785145, + "completion_length": 46.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 81.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3603315353393555, + "kl": 0.1219240315258503, + "learning_rate": 1.8706666666666667e-06, + "loss": -0.0149, + "num_tokens": 1300251.0, + "reward": 3.25, + "reward_std": 3.3040380477905273, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 3.3040380477905273, + "step": 4389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 81.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05270630866289139, + "kl": 0.04471280239522457, + "learning_rate": 1.8703333333333335e-06, + "loss": 0.0022, + "num_tokens": 1300663.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 81.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0057082148268818855, + "kl": 0.01567525276914239, + "learning_rate": 1.8699999999999999e-06, + "loss": 0.0008, + "num_tokens": 1300923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 81.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003464862238615751, + "kl": 6.67572021484375e-05, + "learning_rate": 1.8696666666666667e-06, + "loss": 0.0, + "num_tokens": 1301143.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.232277512550354, + "kl": 0.020496641693171114, + "learning_rate": 1.8693333333333332e-06, + "loss": 0.0013, + "num_tokens": 1301424.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09789823740720749, + "kl": 0.016767382621765137, + "learning_rate": 1.869e-06, + "loss": 0.0008, + "num_tokens": 1301713.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 81.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09358594566583633, + "kl": 0.0170522122643888, + "learning_rate": 1.8686666666666668e-06, + "loss": 0.0008, + "num_tokens": 1302043.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 81.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6361145377159119, + "kl": 0.02818980673328042, + "learning_rate": 1.8683333333333334e-06, + "loss": 0.0014, + "num_tokens": 1302374.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 81.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19904351234436035, + "kl": 0.023975819582119584, + "learning_rate": 1.8680000000000002e-06, + "loss": 0.0014, + "num_tokens": 1302648.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 81.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7266082763671875, + "kl": 0.043629519641399384, + "learning_rate": 1.8676666666666667e-06, + "loss": 0.0686, + "num_tokens": 1303009.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 81.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03461923822760582, + "kl": 0.001159273087978363, + "learning_rate": 1.8673333333333335e-06, + "loss": 0.0001, + "num_tokens": 1303269.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 81.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03138570860028267, + "kl": 0.00265646877232939, + "learning_rate": 1.8669999999999999e-06, + "loss": 0.0001, + "num_tokens": 1303580.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 81.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10708634555339813, + "kl": 0.03524964302778244, + "learning_rate": 1.8666666666666667e-06, + "loss": 0.0018, + "num_tokens": 1303908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 81.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13446207344532013, + "kl": 0.03038789052516222, + "learning_rate": 1.8663333333333332e-06, + "loss": 0.0016, + "num_tokens": 1304208.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 81.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0122764110565186, + "kl": 0.15331338718533516, + "learning_rate": 1.866e-06, + "loss": -0.0109, + "num_tokens": 1304578.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 81.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3191465139389038, + "kl": 0.0944865271449089, + "learning_rate": 1.8656666666666668e-06, + "loss": 0.0047, + "num_tokens": 1304965.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 81.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34566548466682434, + "kl": 0.016131113283336163, + "learning_rate": 1.8653333333333334e-06, + "loss": 0.0011, + "num_tokens": 1305181.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 81.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025926288217306137, + "kl": 0.00026322901248931885, + "learning_rate": 1.8650000000000001e-06, + "loss": 0.0, + "num_tokens": 1305394.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 81.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08051861822605133, + "kl": 0.008359687402844429, + "learning_rate": 1.8646666666666667e-06, + "loss": 0.0004, + "num_tokens": 1305658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021630890667438507, + "kl": 0.023254934698343277, + "learning_rate": 1.8643333333333335e-06, + "loss": 0.0013, + "num_tokens": 1305947.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 81.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014926270581781864, + "kl": 0.0019489709520712495, + "learning_rate": 1.8639999999999999e-06, + "loss": 0.0001, + "num_tokens": 1306231.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 81.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2556963860988617, + "kl": 0.07968771830201149, + "learning_rate": 1.8636666666666666e-06, + "loss": 0.0034, + "num_tokens": 1306554.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 81.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.685092031955719, + "kl": 0.10645676963031292, + "learning_rate": 1.8633333333333332e-06, + "loss": 0.0054, + "num_tokens": 1306899.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 81.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03368373587727547, + "kl": 0.005524386069737375, + "learning_rate": 1.863e-06, + "loss": 0.0003, + "num_tokens": 1307201.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023091167211532593, + "kl": 0.0023625462781637907, + "learning_rate": 1.8626666666666668e-06, + "loss": 0.0001, + "num_tokens": 1307461.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009918338619172573, + "kl": 0.0012138157617300749, + "learning_rate": 1.8623333333333333e-06, + "loss": 0.0001, + "num_tokens": 1307721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 81.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07133351266384125, + "kl": 0.16373159736394882, + "learning_rate": 1.8620000000000001e-06, + "loss": 0.0082, + "num_tokens": 1308032.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 81.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.798025608062744, + "kl": 0.10175292007625103, + "learning_rate": 1.8616666666666667e-06, + "loss": 0.1123, + "num_tokens": 1308341.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 81.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04014373570680618, + "kl": 0.003596288152039051, + "learning_rate": 1.8613333333333335e-06, + "loss": 0.0002, + "num_tokens": 1308653.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 81.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1125853061676025, + "kl": 0.09719396103173494, + "learning_rate": 1.8609999999999998e-06, + "loss": -0.0585, + "num_tokens": 1309036.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 81.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0448373481631279, + "kl": 0.016422273591160774, + "learning_rate": 1.8606666666666668e-06, + "loss": 0.0008, + "num_tokens": 1309341.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010965530760586262, + "kl": 0.00019196867651771754, + "learning_rate": 1.8603333333333332e-06, + "loss": 0.0, + "num_tokens": 1309597.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 81.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08737992495298386, + "kl": 0.03834068216383457, + "learning_rate": 1.86e-06, + "loss": 0.0019, + "num_tokens": 1309872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 81.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8967413902282715, + "kl": 0.011913027847185731, + "learning_rate": 1.8596666666666668e-06, + "loss": -0.001, + "num_tokens": 1310202.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 81.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008811669424176216, + "kl": 0.004763320088386536, + "learning_rate": 1.8593333333333333e-06, + "loss": 0.0002, + "num_tokens": 1310418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 81.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01139820646494627, + "kl": 0.0010455269366502762, + "learning_rate": 1.8590000000000001e-06, + "loss": 0.0001, + "num_tokens": 1310736.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 76.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 81.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7842226028442383, + "kl": 0.010343640809878707, + "learning_rate": 1.8586666666666667e-06, + "loss": 0.4647, + "num_tokens": 1311260.0, + "reward": 6.300000190734863, + "reward_std": 2.4000000953674316, + "rewards/reward_combined/mean": 6.300000190734863, + "rewards/reward_combined/std": 2.3999998569488525, + "step": 4425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 81.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0674179419875145, + "kl": 0.004618597333319485, + "learning_rate": 1.8583333333333335e-06, + "loss": 0.0002, + "num_tokens": 1311503.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 81.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.542253017425537, + "kl": 0.3521061260253191, + "learning_rate": 1.8579999999999998e-06, + "loss": 0.0615, + "num_tokens": 1311803.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 82.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021548960357904434, + "kl": 0.2686252146959305, + "learning_rate": 1.8576666666666668e-06, + "loss": 0.0134, + "num_tokens": 1312107.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 82.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07663135975599289, + "kl": 0.006256932392716408, + "learning_rate": 1.8573333333333332e-06, + "loss": 0.0003, + "num_tokens": 1312379.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 82.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25091290473938, + "kl": 0.42687382688745856, + "learning_rate": 1.857e-06, + "loss": -0.0498, + "num_tokens": 1312718.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 82.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052096232771873474, + "kl": 0.002888257906306535, + "learning_rate": 1.8566666666666667e-06, + "loss": 0.0001, + "num_tokens": 1312988.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 82.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06352750957012177, + "kl": 0.003940177790354937, + "learning_rate": 1.8563333333333333e-06, + "loss": 0.0002, + "num_tokens": 1313231.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 82.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22415760159492493, + "kl": 0.023728660540655255, + "learning_rate": 1.856e-06, + "loss": 0.0014, + "num_tokens": 1313558.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 82.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037158382474444807, + "kl": 5.504488945007324e-05, + "learning_rate": 1.8556666666666667e-06, + "loss": 0.0, + "num_tokens": 1313778.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 82.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011057434603571892, + "kl": 0.0010029137483797967, + "learning_rate": 1.8553333333333335e-06, + "loss": 0.0001, + "num_tokens": 1314098.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 82.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2809027433395386, + "kl": 0.020557444542646408, + "learning_rate": 1.8549999999999998e-06, + "loss": 0.0011, + "num_tokens": 1314309.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 82.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06133417412638664, + "kl": 0.26321327686309814, + "learning_rate": 1.8546666666666668e-06, + "loss": 0.0131, + "num_tokens": 1314615.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 82.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014822466764599085, + "kl": 0.003606371581554413, + "learning_rate": 1.8543333333333332e-06, + "loss": 0.0002, + "num_tokens": 1314851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 82.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05624133348464966, + "kl": 0.00447840424021706, + "learning_rate": 1.854e-06, + "loss": 0.0002, + "num_tokens": 1315121.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 82.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10262904316186905, + "kl": 0.1373457908630371, + "learning_rate": 1.853666666666667e-06, + "loss": 0.0069, + "num_tokens": 1315493.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 82.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.449041843414307, + "kl": 0.022232317132875323, + "learning_rate": 1.8533333333333333e-06, + "loss": 0.0304, + "num_tokens": 1315822.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 82.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07518257200717926, + "kl": 0.012855518143624067, + "learning_rate": 1.853e-06, + "loss": 0.0006, + "num_tokens": 1316110.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 82.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06869995594024658, + "kl": 0.057380760088562965, + "learning_rate": 1.8526666666666667e-06, + "loss": 0.0029, + "num_tokens": 1316454.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12256380915641785, + "kl": 0.01273546414449811, + "learning_rate": 1.8523333333333334e-06, + "loss": 0.0006, + "num_tokens": 1316721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 82.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007562916725873947, + "kl": 0.0012484967592172325, + "learning_rate": 1.852e-06, + "loss": 0.0001, + "num_tokens": 1316941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22549545764923096, + "kl": 0.06006450392305851, + "learning_rate": 1.8516666666666668e-06, + "loss": 0.0033, + "num_tokens": 1317256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17211684584617615, + "kl": 0.027469228953123093, + "learning_rate": 1.8513333333333336e-06, + "loss": 0.0016, + "num_tokens": 1317538.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 82.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20630426704883575, + "kl": 0.051430992782115936, + "learning_rate": 1.851e-06, + "loss": 0.0025, + "num_tokens": 1317811.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 82.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041266996413469315, + "kl": 0.0024755297927185893, + "learning_rate": 1.850666666666667e-06, + "loss": 0.0001, + "num_tokens": 1318071.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 82.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.600343227386475, + "kl": 0.05811155028641224, + "learning_rate": 1.8503333333333333e-06, + "loss": 0.2471, + "num_tokens": 1318386.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 82.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07347581535577774, + "kl": 0.01644020201638341, + "learning_rate": 1.85e-06, + "loss": 0.0009, + "num_tokens": 1318657.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 82.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04788690060377121, + "kl": 0.009458801476284862, + "learning_rate": 1.8496666666666666e-06, + "loss": 0.0005, + "num_tokens": 1318953.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 82.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008286603726446629, + "kl": 0.0015786755830049515, + "learning_rate": 1.8493333333333334e-06, + "loss": 0.0001, + "num_tokens": 1319265.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 82.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15357807278633118, + "kl": 0.00651755859144032, + "learning_rate": 1.849e-06, + "loss": 0.0003, + "num_tokens": 1319499.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 82.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13082171976566315, + "kl": 0.02668852312490344, + "learning_rate": 1.8486666666666668e-06, + "loss": 0.0014, + "num_tokens": 1319806.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 82.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19236665964126587, + "kl": 0.03527451306581497, + "learning_rate": 1.8483333333333336e-06, + "loss": 0.0018, + "num_tokens": 1320161.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 82.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0416932106018066, + "kl": 0.06623013317584991, + "learning_rate": 1.848e-06, + "loss": 0.0051, + "num_tokens": 1320565.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 4457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 82.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04172458499670029, + "kl": 0.0007762670575175434, + "learning_rate": 1.847666666666667e-06, + "loss": 0.0, + "num_tokens": 1320821.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 82.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06234841048717499, + "kl": 0.02629727590829134, + "learning_rate": 1.8473333333333333e-06, + "loss": 0.0013, + "num_tokens": 1321175.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 82.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03425826132297516, + "kl": 0.164263017475605, + "learning_rate": 1.847e-06, + "loss": 0.0082, + "num_tokens": 1321484.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 82.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009655088186264038, + "kl": 0.004441611468791962, + "learning_rate": 1.8466666666666666e-06, + "loss": 0.0002, + "num_tokens": 1321700.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.006172839552164078, + "clip_ratio/low_min": 0.006172839552164078, + "clip_ratio/region_mean": 0.006172839552164078, + "completion_length": 44.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 44.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 82.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3002357482910156, + "kl": 0.04747505113482475, + "learning_rate": 1.8463333333333334e-06, + "loss": -0.0896, + "num_tokens": 1322098.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01681666262447834, + "kl": 0.03840895835310221, + "learning_rate": 1.846e-06, + "loss": 0.002, + "num_tokens": 1322390.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 82.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.687446117401123, + "kl": 0.03372030612081289, + "learning_rate": 1.8456666666666668e-06, + "loss": 0.0906, + "num_tokens": 1322694.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 82.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05643114820122719, + "kl": 0.0071255359798669815, + "learning_rate": 1.8453333333333336e-06, + "loss": 0.0004, + "num_tokens": 1323000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 82.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10460171103477478, + "kl": 0.0030441894195973873, + "learning_rate": 1.8450000000000001e-06, + "loss": 0.0002, + "num_tokens": 1323213.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 82.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019232025370001793, + "kl": 0.004105325788259506, + "learning_rate": 1.844666666666667e-06, + "loss": 0.0002, + "num_tokens": 1323504.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013890565373003483, + "kl": 0.0019186849240213633, + "learning_rate": 1.8443333333333333e-06, + "loss": 0.0001, + "num_tokens": 1323788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 82.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02797258459031582, + "kl": 0.007735855877399445, + "learning_rate": 1.844e-06, + "loss": 0.0004, + "num_tokens": 1324120.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 82.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031194305047392845, + "kl": 0.005104768555611372, + "learning_rate": 1.8436666666666666e-06, + "loss": 0.0002, + "num_tokens": 1324378.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 82.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02669372223317623, + "kl": 0.0013609528541564941, + "learning_rate": 1.8433333333333334e-06, + "loss": 0.0001, + "num_tokens": 1324590.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 82.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036326637491583824, + "kl": 0.0005698055028915405, + "learning_rate": 1.843e-06, + "loss": 0.0, + "num_tokens": 1324850.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 82.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212266743183136, + "kl": 0.07454855367541313, + "learning_rate": 1.8426666666666668e-06, + "loss": 0.0037, + "num_tokens": 1325235.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 82.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006403028033673763, + "kl": 0.0016177594079636037, + "learning_rate": 1.8423333333333335e-06, + "loss": 0.0001, + "num_tokens": 1325515.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 82.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09204453229904175, + "kl": 0.03152099810540676, + "learning_rate": 1.8420000000000001e-06, + "loss": 0.0016, + "num_tokens": 1325817.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4475 + }, + { + "clip_ratio/high_max": 0.012500000186264515, + "clip_ratio/high_mean": 0.012500000186264515, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012500000186264515, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 82.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.091361999511719, + "kl": 0.0465207826346159, + "learning_rate": 1.8416666666666669e-06, + "loss": -0.067, + "num_tokens": 1326158.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 82.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00873350165784359, + "kl": 0.0015225483803078532, + "learning_rate": 1.8413333333333333e-06, + "loss": 0.0001, + "num_tokens": 1326464.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20845302939414978, + "kl": 0.02591664995998144, + "learning_rate": 1.841e-06, + "loss": 0.0012, + "num_tokens": 1326753.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 82.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028067268431186676, + "kl": 0.004073692311067134, + "learning_rate": 1.8406666666666666e-06, + "loss": 0.0002, + "num_tokens": 1327019.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 82.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4394779205322266, + "kl": 0.06088544428348541, + "learning_rate": 1.8403333333333334e-06, + "loss": -0.0382, + "num_tokens": 1327374.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 82.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.835725784301758, + "kl": 0.18542409967631102, + "learning_rate": 1.84e-06, + "loss": 0.0704, + "num_tokens": 1327662.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 83.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00518677057698369, + "kl": 0.015772895887494087, + "learning_rate": 1.8396666666666667e-06, + "loss": 0.0008, + "num_tokens": 1327922.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 83.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06541557610034943, + "kl": 0.004301687586121261, + "learning_rate": 1.8393333333333335e-06, + "loss": 0.0002, + "num_tokens": 1328218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 83.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.420662522315979, + "kl": 0.055525410920381546, + "learning_rate": 1.839e-06, + "loss": 0.0638, + "num_tokens": 1328593.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 83.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.934332847595215, + "kl": 0.1555764600634575, + "learning_rate": 1.8386666666666669e-06, + "loss": -0.0027, + "num_tokens": 1328959.0, + "reward": 6.125, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 4485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17491045594215393, + "kl": 0.012714287266135216, + "learning_rate": 1.8383333333333332e-06, + "loss": 0.0007, + "num_tokens": 1329226.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 83.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1922261267900467, + "kl": 0.031621651723980904, + "learning_rate": 1.838e-06, + "loss": 0.0016, + "num_tokens": 1329564.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1284618079662323, + "kl": 0.0060302456840872765, + "learning_rate": 1.8376666666666666e-06, + "loss": 0.0003, + "num_tokens": 1329828.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1297995150089264, + "kl": 0.045892647467553616, + "learning_rate": 1.8373333333333334e-06, + "loss": 0.002, + "num_tokens": 1330155.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 83.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18180488049983978, + "kl": 0.028762775473296642, + "learning_rate": 1.837e-06, + "loss": 0.0015, + "num_tokens": 1330448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.009615384973585606, + "clip_ratio/high_mean": 0.009615384973585606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009615384973585606, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 83.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.960066080093384, + "kl": 0.04335315991193056, + "learning_rate": 1.8366666666666667e-06, + "loss": -0.0992, + "num_tokens": 1330764.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 4491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 83.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021585283800959587, + "kl": 0.0071603478863835335, + "learning_rate": 1.8363333333333335e-06, + "loss": 0.0004, + "num_tokens": 1331032.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 83.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01160829234868288, + "kl": 0.0013812032993882895, + "learning_rate": 1.836e-06, + "loss": 0.0001, + "num_tokens": 1331302.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 83.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022795287892222404, + "kl": 0.0024616834707558155, + "learning_rate": 1.8356666666666669e-06, + "loss": 0.0001, + "num_tokens": 1331586.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 83.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09836888313293457, + "kl": 0.008340646279975772, + "learning_rate": 1.8353333333333332e-06, + "loss": 0.0003, + "num_tokens": 1331840.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 83.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008560781367123127, + "kl": 0.009677237831056118, + "learning_rate": 1.8350000000000002e-06, + "loss": 0.0005, + "num_tokens": 1332112.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0253881998360157, + "kl": 0.0003055781126022339, + "learning_rate": 1.8346666666666666e-06, + "loss": 0.0, + "num_tokens": 1332368.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 83.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34124791622161865, + "kl": 0.06942594796419144, + "learning_rate": 1.8343333333333334e-06, + "loss": 0.0035, + "num_tokens": 1332640.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 83.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00034384283935651183, + "kl": 6.54458999633789e-05, + "learning_rate": 1.834e-06, + "loss": 0.0, + "num_tokens": 1332860.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 83.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043943312019109726, + "kl": 0.0016277527902275324, + "learning_rate": 1.8336666666666667e-06, + "loss": 0.0001, + "num_tokens": 1333136.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 83.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02193913236260414, + "kl": 0.0007171332836151123, + "learning_rate": 1.8333333333333335e-06, + "loss": 0.0, + "num_tokens": 1333348.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 83.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12639886140823364, + "kl": 0.03929758816957474, + "learning_rate": 1.833e-06, + "loss": 0.002, + "num_tokens": 1333646.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 83.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30518773198127747, + "kl": 0.0242879445431754, + "learning_rate": 1.8326666666666669e-06, + "loss": 0.0012, + "num_tokens": 1333950.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 83.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23015724122524261, + "kl": 0.08431190066039562, + "learning_rate": 1.8323333333333332e-06, + "loss": 0.0042, + "num_tokens": 1334353.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 83.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03932933136820793, + "kl": 0.006077993428334594, + "learning_rate": 1.8320000000000002e-06, + "loss": 0.0003, + "num_tokens": 1334630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 83.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09884496033191681, + "kl": 0.0037670237943530083, + "learning_rate": 1.8316666666666666e-06, + "loss": 0.0002, + "num_tokens": 1334852.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 83.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015210869023576379, + "kl": 0.003596954047679901, + "learning_rate": 1.8313333333333333e-06, + "loss": 0.0002, + "num_tokens": 1335088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 83.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018884696066379547, + "kl": 0.0008276030421257019, + "learning_rate": 1.831e-06, + "loss": 0.0, + "num_tokens": 1335300.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 83.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.681485176086426, + "kl": 0.1734844595193863, + "learning_rate": 1.8306666666666667e-06, + "loss": 0.0868, + "num_tokens": 1335615.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 83.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9553044438362122, + "kl": 0.08474664390087128, + "learning_rate": 1.8303333333333335e-06, + "loss": 0.0042, + "num_tokens": 1335831.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 83.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020320512354373932, + "kl": 0.0029306603828445077, + "learning_rate": 1.83e-06, + "loss": 0.0001, + "num_tokens": 1336066.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 83.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015633560717105865, + "kl": 0.002356857992708683, + "learning_rate": 1.8296666666666668e-06, + "loss": 0.0001, + "num_tokens": 1336378.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 83.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.939454555511475, + "kl": 0.5071391463279724, + "learning_rate": 1.8293333333333332e-06, + "loss": 0.242, + "num_tokens": 1336746.0, + "reward": 7.5, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 1.0, + "step": 4513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 83.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.015135765075684, + "kl": 0.15286946669220924, + "learning_rate": 1.8290000000000002e-06, + "loss": 0.019, + "num_tokens": 1337097.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 83.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02253352664411068, + "kl": 0.0007894709706306458, + "learning_rate": 1.8286666666666666e-06, + "loss": 0.0, + "num_tokens": 1337357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 83.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037067148834466934, + "kl": 0.004079599282704294, + "learning_rate": 1.8283333333333333e-06, + "loss": 0.0002, + "num_tokens": 1337689.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 83.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16268889605998993, + "kl": 0.015642430866137147, + "learning_rate": 1.828e-06, + "loss": 0.0008, + "num_tokens": 1338017.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 83.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796192526817322, + "kl": 0.022601601667702198, + "learning_rate": 1.8276666666666667e-06, + "loss": 0.0012, + "num_tokens": 1338307.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 83.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056297432631254196, + "kl": 0.008035860490053892, + "learning_rate": 1.8273333333333335e-06, + "loss": 0.0004, + "num_tokens": 1338605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 83.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0612456314265728, + "kl": 0.025490344502031803, + "learning_rate": 1.827e-06, + "loss": 0.0014, + "num_tokens": 1338894.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 83.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08116748183965683, + "kl": 0.004820789908990264, + "learning_rate": 1.8266666666666668e-06, + "loss": 0.0002, + "num_tokens": 1339156.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 83.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1013159528374672, + "kl": 0.011585609056055546, + "learning_rate": 1.8263333333333334e-06, + "loss": 0.0006, + "num_tokens": 1339428.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 83.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1381785124540329, + "kl": 0.02544863522052765, + "learning_rate": 1.8260000000000002e-06, + "loss": 0.0013, + "num_tokens": 1339724.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 83.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05593863129615784, + "kl": 0.007262100873049349, + "learning_rate": 1.8256666666666665e-06, + "loss": 0.0004, + "num_tokens": 1340047.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 83.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06183823570609093, + "kl": 0.030166875571012497, + "learning_rate": 1.8253333333333333e-06, + "loss": 0.0015, + "num_tokens": 1340399.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 83.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005003071390092373, + "kl": 0.001004202465992421, + "learning_rate": 1.8249999999999999e-06, + "loss": 0.0001, + "num_tokens": 1340711.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 83.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00254241144284606, + "kl": 0.26853884756565094, + "learning_rate": 1.8246666666666667e-06, + "loss": 0.0134, + "num_tokens": 1341015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 83.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10519613325595856, + "kl": 0.004050101386383176, + "learning_rate": 1.8243333333333335e-06, + "loss": 0.0003, + "num_tokens": 1341226.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 83.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7129215002059937, + "kl": 0.19857073947787285, + "learning_rate": 1.824e-06, + "loss": 0.0099, + "num_tokens": 1341598.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 83.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001951477024704218, + "kl": 0.0001305900514125824, + "learning_rate": 1.8236666666666668e-06, + "loss": 0.0, + "num_tokens": 1341842.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05018823966383934, + "kl": 0.005668552126735449, + "learning_rate": 1.8233333333333334e-06, + "loss": 0.0002, + "num_tokens": 1342121.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 83.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5049600005149841, + "kl": 0.06107555702328682, + "learning_rate": 1.8230000000000002e-06, + "loss": 0.003, + "num_tokens": 1342448.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 83.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24273280799388885, + "kl": 0.06250195764005184, + "learning_rate": 1.8226666666666665e-06, + "loss": 0.0022, + "num_tokens": 1342833.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 83.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07775144279003143, + "kl": 0.011150154285132885, + "learning_rate": 1.8223333333333333e-06, + "loss": 0.0006, + "num_tokens": 1343127.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 83.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028379112482070923, + "kl": 0.001139858883107081, + "learning_rate": 1.8219999999999999e-06, + "loss": 0.0001, + "num_tokens": 1343397.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 84.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006011796649545431, + "kl": 0.015576763078570366, + "learning_rate": 1.8216666666666667e-06, + "loss": 0.0008, + "num_tokens": 1343657.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 84.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0366351380944252, + "kl": 0.002985842409543693, + "learning_rate": 1.8213333333333334e-06, + "loss": 0.0001, + "num_tokens": 1343939.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 84.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3243839740753174, + "kl": 0.24699855595827103, + "learning_rate": 1.821e-06, + "loss": -0.0631, + "num_tokens": 1344310.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 84.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09131787717342377, + "kl": 0.021963478066027164, + "learning_rate": 1.8206666666666668e-06, + "loss": 0.0012, + "num_tokens": 1344586.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 84.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06585431843996048, + "kl": 0.0061761485412716866, + "learning_rate": 1.8203333333333334e-06, + "loss": 0.0003, + "num_tokens": 1344889.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 84.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23800045251846313, + "kl": 0.010036620311439037, + "learning_rate": 1.8200000000000002e-06, + "loss": 0.0005, + "num_tokens": 1345107.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 84.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5954710841178894, + "kl": 0.0889275036752224, + "learning_rate": 1.8196666666666665e-06, + "loss": 0.0044, + "num_tokens": 1345480.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 84.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026880059391260147, + "kl": 0.04410934820771217, + "learning_rate": 1.8193333333333335e-06, + "loss": 0.0022, + "num_tokens": 1345892.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 84.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07424087077379227, + "kl": 0.008653692668303847, + "learning_rate": 1.8189999999999999e-06, + "loss": 0.0004, + "num_tokens": 1346166.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 84.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003423725429456681, + "kl": 6.1817467212677e-05, + "learning_rate": 1.8186666666666666e-06, + "loss": 0.0, + "num_tokens": 1346386.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 84.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150363802909851, + "kl": 0.0023117363452911377, + "learning_rate": 1.8183333333333334e-06, + "loss": 0.0001, + "num_tokens": 1346620.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 84.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09289748966693878, + "kl": 0.12808343768119812, + "learning_rate": 1.818e-06, + "loss": 0.0064, + "num_tokens": 1346992.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 84.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6736209392547607, + "kl": 0.04916405491530895, + "learning_rate": 1.8176666666666668e-06, + "loss": 0.1031, + "num_tokens": 1347304.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009528686292469501, + "kl": 0.009169904980808496, + "learning_rate": 1.8173333333333334e-06, + "loss": 0.0005, + "num_tokens": 1347576.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 84.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0630974769592285, + "kl": 0.024663350079208612, + "learning_rate": 1.8170000000000001e-06, + "loss": 0.0756, + "num_tokens": 1347912.0, + "reward": 5.550000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 5.550000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 84.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21899208426475525, + "kl": 0.023449061438441277, + "learning_rate": 1.8166666666666665e-06, + "loss": 0.0012, + "num_tokens": 1348239.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 84.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1031763106584549, + "kl": 0.013997962232679129, + "learning_rate": 1.8163333333333335e-06, + "loss": 0.0007, + "num_tokens": 1348527.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 84.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006296938750892878, + "kl": 0.00042418017983436584, + "learning_rate": 1.8159999999999999e-06, + "loss": 0.0, + "num_tokens": 1348787.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 84.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030918868258595467, + "kl": 0.005808588815853, + "learning_rate": 1.8156666666666666e-06, + "loss": 0.0003, + "num_tokens": 1349121.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02945765294134617, + "kl": 0.0069765131920576096, + "learning_rate": 1.8153333333333334e-06, + "loss": 0.0003, + "num_tokens": 1349401.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 84.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002900908002629876, + "kl": 0.00014244019985198975, + "learning_rate": 1.815e-06, + "loss": 0.0, + "num_tokens": 1349645.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17949199676513672, + "kl": 0.03702153544872999, + "learning_rate": 1.8146666666666668e-06, + "loss": 0.0019, + "num_tokens": 1349915.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 84.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016543844249099493, + "kl": 0.003559909760951996, + "learning_rate": 1.8143333333333333e-06, + "loss": 0.0002, + "num_tokens": 1350151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 84.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062095727771520615, + "kl": 0.01190496701747179, + "learning_rate": 1.8140000000000001e-06, + "loss": 0.0006, + "num_tokens": 1350405.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09647462517023087, + "kl": 0.014620958943851292, + "learning_rate": 1.8136666666666665e-06, + "loss": 0.0007, + "num_tokens": 1350693.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 84.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007919061928987503, + "kl": 0.0029959604144096375, + "learning_rate": 1.8133333333333335e-06, + "loss": 0.0001, + "num_tokens": 1350909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4561 + }, + { + "clip_ratio/high_max": 0.02380952425301075, + "clip_ratio/high_mean": 0.02380952425301075, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02380952425301075, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 84.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.815066814422607, + "kl": 0.18687807023525238, + "learning_rate": 1.8129999999999998e-06, + "loss": -0.0142, + "num_tokens": 1351211.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 84.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10813266038894653, + "kl": 0.019688505679368973, + "learning_rate": 1.8126666666666666e-06, + "loss": 0.001, + "num_tokens": 1351507.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 84.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10175729542970657, + "kl": 0.17746182531118393, + "learning_rate": 1.8123333333333336e-06, + "loss": 0.0089, + "num_tokens": 1351815.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 84.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17559285461902618, + "kl": 0.028179899789392948, + "learning_rate": 1.812e-06, + "loss": 0.0014, + "num_tokens": 1352074.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 84.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033068438060581684, + "kl": 0.0017240876331925392, + "learning_rate": 1.8116666666666668e-06, + "loss": 0.0001, + "num_tokens": 1352386.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 84.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16999752819538116, + "kl": 0.03488508611917496, + "learning_rate": 1.8113333333333333e-06, + "loss": 0.0016, + "num_tokens": 1352721.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 84.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002821570262312889, + "kl": 0.2684878706932068, + "learning_rate": 1.8110000000000001e-06, + "loss": 0.0134, + "num_tokens": 1353025.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 84.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005127115175127983, + "kl": 0.0008829649887047708, + "learning_rate": 1.8106666666666667e-06, + "loss": 0.0, + "num_tokens": 1353337.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 84.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1310882568359375, + "kl": 0.023206555051729083, + "learning_rate": 1.8103333333333335e-06, + "loss": 0.0011, + "num_tokens": 1353660.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 84.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.460738182067871, + "kl": 0.35734116565436125, + "learning_rate": 1.8100000000000002e-06, + "loss": 0.0924, + "num_tokens": 1353970.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 84.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7891405820846558, + "kl": 0.08435942232608795, + "learning_rate": 1.8096666666666666e-06, + "loss": -0.0144, + "num_tokens": 1354331.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 84.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03023175522685051, + "kl": 0.0008117755642160773, + "learning_rate": 1.8093333333333336e-06, + "loss": 0.0001, + "num_tokens": 1354547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 84.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16713611781597137, + "kl": 0.023551173508167267, + "learning_rate": 1.809e-06, + "loss": 0.0014, + "num_tokens": 1354849.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08143755048513412, + "kl": 0.02762952505145222, + "learning_rate": 1.8086666666666667e-06, + "loss": 0.0014, + "num_tokens": 1355137.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 84.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07869987189769745, + "kl": 0.002967948792502284, + "learning_rate": 1.8083333333333333e-06, + "loss": 0.0002, + "num_tokens": 1355347.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04444926232099533, + "kl": 0.0030876006931066513, + "learning_rate": 1.808e-06, + "loss": 0.0002, + "num_tokens": 1355616.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 84.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0590357631444931, + "kl": 0.007288108114153147, + "learning_rate": 1.8076666666666667e-06, + "loss": 0.0004, + "num_tokens": 1355889.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 84.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016484994441270828, + "kl": 0.0035086802672594786, + "learning_rate": 1.8073333333333334e-06, + "loss": 0.0002, + "num_tokens": 1356157.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 84.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08064191788434982, + "kl": 0.0109627153724432, + "learning_rate": 1.8070000000000002e-06, + "loss": 0.0006, + "num_tokens": 1356427.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 84.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03658544272184372, + "kl": 0.0006570935256604571, + "learning_rate": 1.8066666666666666e-06, + "loss": 0.0, + "num_tokens": 1356683.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 84.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18326403200626373, + "kl": 0.027295667678117752, + "learning_rate": 1.8063333333333336e-06, + "loss": 0.0014, + "num_tokens": 1356976.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 84.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.215245723724365, + "kl": 0.04609328508377075, + "learning_rate": 1.806e-06, + "loss": 0.0343, + "num_tokens": 1357288.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 84.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9655568599700928, + "kl": 0.07130121439695358, + "learning_rate": 1.8056666666666667e-06, + "loss": 0.1517, + "num_tokens": 1357646.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 84.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787163764238358, + "kl": 0.003359407768584788, + "learning_rate": 1.8053333333333333e-06, + "loss": 0.0002, + "num_tokens": 1357966.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 84.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0718170776963234, + "kl": 0.001834855880588293, + "learning_rate": 1.805e-06, + "loss": 0.0001, + "num_tokens": 1358234.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 50.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 84.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5237232446670532, + "kl": 0.017707230523228645, + "learning_rate": 1.8046666666666667e-06, + "loss": 0.0478, + "num_tokens": 1358656.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 84.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016777342185378075, + "kl": 0.0011444509727880359, + "learning_rate": 1.8043333333333334e-06, + "loss": 0.0001, + "num_tokens": 1358916.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 84.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.729802370071411, + "kl": 0.022776659578084946, + "learning_rate": 1.8040000000000002e-06, + "loss": 0.1147, + "num_tokens": 1359245.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04702303186058998, + "kl": 0.0009015277028083801, + "learning_rate": 1.8036666666666668e-06, + "loss": 0.0, + "num_tokens": 1359458.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 85.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036120835691690445, + "kl": 0.007666812743991613, + "learning_rate": 1.8033333333333336e-06, + "loss": 0.0004, + "num_tokens": 1359792.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 85.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09750744700431824, + "kl": 0.0036141001619398594, + "learning_rate": 1.803e-06, + "loss": 0.0002, + "num_tokens": 1360050.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 85.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09926003962755203, + "kl": 0.020237690769135952, + "learning_rate": 1.8026666666666667e-06, + "loss": 0.001, + "num_tokens": 1360312.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 85.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028317492455244064, + "kl": 0.0011716534500010312, + "learning_rate": 1.8023333333333333e-06, + "loss": 0.0001, + "num_tokens": 1360532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 85.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028795655816793442, + "kl": 0.013575200457125902, + "learning_rate": 1.802e-06, + "loss": 0.0007, + "num_tokens": 1360829.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 85.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027327844873070717, + "kl": 0.2685043662786484, + "learning_rate": 1.8016666666666666e-06, + "loss": 0.0134, + "num_tokens": 1361133.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5779788494110107, + "kl": 0.7720355167984962, + "learning_rate": 1.8013333333333334e-06, + "loss": 0.0974, + "num_tokens": 1361370.0, + "reward": 3.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 1.25, + "step": 4597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 85.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14958490431308746, + "kl": 0.030108027160167694, + "learning_rate": 1.8010000000000002e-06, + "loss": 0.0016, + "num_tokens": 1361717.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 85.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06679612398147583, + "kl": 0.003531815833412111, + "learning_rate": 1.8006666666666668e-06, + "loss": 0.0002, + "num_tokens": 1361951.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 85.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7573078870773315, + "kl": 0.06498152017593384, + "learning_rate": 1.8003333333333336e-06, + "loss": 0.0022, + "num_tokens": 1362356.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 85.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03526627644896507, + "kl": 0.014735556207597256, + "learning_rate": 1.8e-06, + "loss": 0.0007, + "num_tokens": 1362660.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 85.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07709838449954987, + "kl": 0.008507295278832316, + "learning_rate": 1.7996666666666667e-06, + "loss": 0.0004, + "num_tokens": 1362958.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11057507991790771, + "kl": 0.014811470173299313, + "learning_rate": 1.7993333333333333e-06, + "loss": 0.0007, + "num_tokens": 1363230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 85.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004880153574049473, + "kl": 0.001476626261137426, + "learning_rate": 1.799e-06, + "loss": 0.0001, + "num_tokens": 1363510.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16540856659412384, + "kl": 0.02626362256705761, + "learning_rate": 1.7986666666666666e-06, + "loss": 0.0013, + "num_tokens": 1363782.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 85.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5658769607543945, + "kl": 0.05040191859006882, + "learning_rate": 1.7983333333333334e-06, + "loss": -0.0134, + "num_tokens": 1364095.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028984514996409416, + "kl": 0.0024870901834219694, + "learning_rate": 1.7980000000000002e-06, + "loss": 0.0001, + "num_tokens": 1364313.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 85.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6176961660385132, + "kl": 0.1045685987919569, + "learning_rate": 1.7976666666666668e-06, + "loss": -0.1292, + "num_tokens": 1364650.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 4608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 85.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0178164504468441, + "kl": 0.004536781634669751, + "learning_rate": 1.7973333333333335e-06, + "loss": 0.0002, + "num_tokens": 1364908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 85.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06068643555045128, + "kl": 0.03795307315886021, + "learning_rate": 1.797e-06, + "loss": 0.0019, + "num_tokens": 1365212.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 85.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007539695128798485, + "kl": 0.0004823381605092436, + "learning_rate": 1.7966666666666667e-06, + "loss": 0.0, + "num_tokens": 1365484.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 85.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017650384455919266, + "kl": 0.0019455926958471537, + "learning_rate": 1.7963333333333333e-06, + "loss": 0.0001, + "num_tokens": 1365814.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09376028180122375, + "kl": 0.0026987403398379683, + "learning_rate": 1.796e-06, + "loss": 0.0001, + "num_tokens": 1366062.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.005154639016836882, + "clip_ratio/low_min": 0.005154639016836882, + "clip_ratio/region_mean": 0.005154639016836882, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 85.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3227314949035645, + "kl": 0.058972058817744255, + "learning_rate": 1.7956666666666666e-06, + "loss": -0.1058, + "num_tokens": 1366417.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 4614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 85.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04207763075828552, + "kl": 0.01446013996610418, + "learning_rate": 1.7953333333333334e-06, + "loss": 0.0008, + "num_tokens": 1366704.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4615 + }, + { + "clip_ratio/high_max": 0.007936508394777775, + "clip_ratio/high_mean": 0.007936508394777775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007936508394777775, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 85.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.861692428588867, + "kl": 0.11261475086212158, + "learning_rate": 1.7950000000000002e-06, + "loss": -0.0225, + "num_tokens": 1367051.0, + "reward": 5.0, + "reward_std": 3.34165620803833, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.34165620803833, + "step": 4616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 85.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07999681681394577, + "kl": 0.029599539004266262, + "learning_rate": 1.7946666666666667e-06, + "loss": 0.0015, + "num_tokens": 1367407.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057774126529693604, + "kl": 0.0025075782323256135, + "learning_rate": 1.7943333333333335e-06, + "loss": 0.0001, + "num_tokens": 1367703.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 85.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12313896417617798, + "kl": 0.006098371231928468, + "learning_rate": 1.7939999999999999e-06, + "loss": 0.0003, + "num_tokens": 1368027.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02504068613052368, + "kl": 0.002125752973370254, + "learning_rate": 1.7936666666666669e-06, + "loss": 0.0001, + "num_tokens": 1368309.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 85.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039043039083480835, + "kl": 0.0010836496949195862, + "learning_rate": 1.7933333333333332e-06, + "loss": 0.0001, + "num_tokens": 1368519.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 85.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05343858152627945, + "kl": 0.0021771948086097836, + "learning_rate": 1.793e-06, + "loss": 0.0001, + "num_tokens": 1368762.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 85.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005909664090722799, + "kl": 0.001699230633676052, + "learning_rate": 1.7926666666666666e-06, + "loss": 0.0001, + "num_tokens": 1369074.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018692174926400185, + "kl": 0.005533390445634723, + "learning_rate": 1.7923333333333334e-06, + "loss": 0.0003, + "num_tokens": 1369342.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 85.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02741626650094986, + "kl": 0.0006975308060646057, + "learning_rate": 1.7920000000000002e-06, + "loss": 0.0, + "num_tokens": 1369616.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 85.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.779372215270996, + "kl": 0.02081705629825592, + "learning_rate": 1.7916666666666667e-06, + "loss": 0.0643, + "num_tokens": 1369912.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 85.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16113191843032837, + "kl": 0.02319626696407795, + "learning_rate": 1.7913333333333335e-06, + "loss": 0.0011, + "num_tokens": 1370235.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011597000062465668, + "kl": 0.00022526085376739502, + "learning_rate": 1.7909999999999999e-06, + "loss": 0.0, + "num_tokens": 1370447.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 85.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001522162463515997, + "kl": 0.0004556626081466675, + "learning_rate": 1.7906666666666669e-06, + "loss": 0.0, + "num_tokens": 1370707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 85.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015761494869366288, + "kl": 5.093216896057129e-05, + "learning_rate": 1.7903333333333332e-06, + "loss": 0.0, + "num_tokens": 1370963.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 85.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.365943431854248, + "kl": 0.07962773321196437, + "learning_rate": 1.79e-06, + "loss": 0.0019, + "num_tokens": 1371235.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 85.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06607271730899811, + "kl": 0.008443673374131322, + "learning_rate": 1.7896666666666666e-06, + "loss": 0.0004, + "num_tokens": 1371528.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 85.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01242861244827509, + "kl": 0.0035997406812384725, + "learning_rate": 1.7893333333333334e-06, + "loss": 0.0002, + "num_tokens": 1371794.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 85.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08627626299858093, + "kl": 0.00473215157398954, + "learning_rate": 1.7890000000000002e-06, + "loss": 0.0002, + "num_tokens": 1372113.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 85.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07728192210197449, + "kl": 0.16567887365818024, + "learning_rate": 1.7886666666666667e-06, + "loss": 0.0083, + "num_tokens": 1372423.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 85.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009083964861929417, + "kl": 0.00946786068379879, + "learning_rate": 1.7883333333333335e-06, + "loss": 0.0005, + "num_tokens": 1372695.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 85.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20461241900920868, + "kl": 0.07090083882212639, + "learning_rate": 1.7879999999999999e-06, + "loss": 0.0036, + "num_tokens": 1373029.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7736951112747192, + "kl": 0.04620979353785515, + "learning_rate": 1.7876666666666669e-06, + "loss": 0.0029, + "num_tokens": 1373248.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 85.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09738536924123764, + "kl": 0.13988986611366272, + "learning_rate": 1.7873333333333332e-06, + "loss": 0.007, + "num_tokens": 1373620.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 85.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2058975249528885, + "kl": 0.01700449548661709, + "learning_rate": 1.787e-06, + "loss": 0.0009, + "num_tokens": 1373950.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 85.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003152804565615952, + "kl": 7.610023021697998e-05, + "learning_rate": 1.7866666666666666e-06, + "loss": 0.0, + "num_tokens": 1374170.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 85.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10333876311779022, + "kl": 0.014639072585850954, + "learning_rate": 1.7863333333333334e-06, + "loss": 0.0007, + "num_tokens": 1374458.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 85.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045813076198101044, + "kl": 0.003015394788235426, + "learning_rate": 1.7860000000000001e-06, + "loss": 0.0002, + "num_tokens": 1374726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 86.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.121963731944561, + "kl": 0.036465009674429893, + "learning_rate": 1.7856666666666667e-06, + "loss": 0.0018, + "num_tokens": 1375066.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 86.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015926910564303398, + "kl": 0.0005370676517486572, + "learning_rate": 1.7853333333333335e-06, + "loss": 0.0, + "num_tokens": 1375322.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 86.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03132804110646248, + "kl": 0.0014229993685148656, + "learning_rate": 1.785e-06, + "loss": 0.0001, + "num_tokens": 1375643.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05192486569285393, + "kl": 0.0046487832441926, + "learning_rate": 1.7846666666666668e-06, + "loss": 0.0002, + "num_tokens": 1375941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 86.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039292074739933014, + "kl": 0.0027387288864701986, + "learning_rate": 1.7843333333333332e-06, + "loss": 0.0001, + "num_tokens": 1376213.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 86.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05678660422563553, + "kl": 0.03147179167717695, + "learning_rate": 1.784e-06, + "loss": 0.0016, + "num_tokens": 1376567.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 86.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0512794628739357, + "kl": 0.003787681460380554, + "learning_rate": 1.7836666666666666e-06, + "loss": 0.0002, + "num_tokens": 1376856.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 86.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036692336201667786, + "kl": 0.003396979649551213, + "learning_rate": 1.7833333333333333e-06, + "loss": 0.0002, + "num_tokens": 1377186.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 86.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01630033180117607, + "kl": 0.0018120001768693328, + "learning_rate": 1.7830000000000001e-06, + "loss": 0.0001, + "num_tokens": 1377454.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 86.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08206231892108917, + "kl": 0.04010830633342266, + "learning_rate": 1.7826666666666667e-06, + "loss": 0.002, + "num_tokens": 1377756.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 86.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11931747198104858, + "kl": 0.0059440258191898465, + "learning_rate": 1.7823333333333335e-06, + "loss": 0.0004, + "num_tokens": 1378023.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3945906460285187, + "kl": 0.03288387740030885, + "learning_rate": 1.782e-06, + "loss": 0.0017, + "num_tokens": 1378318.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 86.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3630080223083496, + "kl": 0.061734676361083984, + "learning_rate": 1.7816666666666668e-06, + "loss": -0.0927, + "num_tokens": 1378691.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 86.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.374413013458252, + "kl": 0.07402043789625168, + "learning_rate": 1.7813333333333332e-06, + "loss": 0.0802, + "num_tokens": 1378976.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 86.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005743706598877907, + "kl": 0.0003885440528392792, + "learning_rate": 1.781e-06, + "loss": 0.0, + "num_tokens": 1379236.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 86.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.250213384628296, + "kl": 0.08220694027841091, + "learning_rate": 1.7806666666666665e-06, + "loss": -0.019, + "num_tokens": 1379536.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 86.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036384690552949905, + "kl": 0.0015540399472229183, + "learning_rate": 1.7803333333333333e-06, + "loss": 0.0001, + "num_tokens": 1379779.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 86.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.386549949645996, + "kl": 0.0676152752712369, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.0169, + "num_tokens": 1380088.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 86.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003157753963023424, + "kl": 7.578730583190918e-05, + "learning_rate": 1.7796666666666667e-06, + "loss": 0.0, + "num_tokens": 1380308.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 86.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18242207169532776, + "kl": 0.021608256734907627, + "learning_rate": 1.7793333333333335e-06, + "loss": 0.0011, + "num_tokens": 1380639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 86.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.346588134765625, + "kl": 0.09413415193557739, + "learning_rate": 1.779e-06, + "loss": 0.0493, + "num_tokens": 1380993.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 86.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02952669747173786, + "kl": 0.0024364853743463755, + "learning_rate": 1.7786666666666668e-06, + "loss": 0.0001, + "num_tokens": 1381228.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 86.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09102169424295425, + "kl": 0.011722934752469882, + "learning_rate": 1.7783333333333332e-06, + "loss": 0.0006, + "num_tokens": 1381500.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 86.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06333321332931519, + "kl": 0.003842551843263209, + "learning_rate": 1.7780000000000002e-06, + "loss": 0.0002, + "num_tokens": 1381811.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 86.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24839980900287628, + "kl": 0.04897315055131912, + "learning_rate": 1.7776666666666665e-06, + "loss": 0.0025, + "num_tokens": 1382087.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 86.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18892042338848114, + "kl": 0.025432278867810965, + "learning_rate": 1.7773333333333333e-06, + "loss": 0.0013, + "num_tokens": 1382377.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06789866089820862, + "kl": 0.019855202175676823, + "learning_rate": 1.777e-06, + "loss": 0.001, + "num_tokens": 1382664.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 86.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03407621011137962, + "kl": 0.05034089274704456, + "learning_rate": 1.7766666666666667e-06, + "loss": 0.0025, + "num_tokens": 1383004.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 86.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09539736062288284, + "kl": 0.0142113221809268, + "learning_rate": 1.7763333333333335e-06, + "loss": 0.0008, + "num_tokens": 1383386.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 86.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01676495186984539, + "kl": 0.002237536944448948, + "learning_rate": 1.776e-06, + "loss": 0.0001, + "num_tokens": 1383698.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 86.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005544695071876049, + "kl": 0.01563091389834881, + "learning_rate": 1.7756666666666668e-06, + "loss": 0.0008, + "num_tokens": 1383958.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 86.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17210577428340912, + "kl": 0.01150283170863986, + "learning_rate": 1.7753333333333332e-06, + "loss": 0.0006, + "num_tokens": 1384285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 86.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010064753703773022, + "kl": 0.0001001238779281266, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.0, + "num_tokens": 1384541.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 86.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10454939305782318, + "kl": 0.0301426500082016, + "learning_rate": 1.7746666666666665e-06, + "loss": 0.0015, + "num_tokens": 1384869.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 86.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01666002906858921, + "kl": 0.0014522984565701336, + "learning_rate": 1.7743333333333333e-06, + "loss": 0.0001, + "num_tokens": 1385088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044514428824186325, + "kl": 0.0018117026193067431, + "learning_rate": 1.774e-06, + "loss": 0.0001, + "num_tokens": 1385372.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 86.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018677186220884323, + "kl": 0.005287077045068145, + "learning_rate": 1.7736666666666667e-06, + "loss": 0.0003, + "num_tokens": 1385640.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 86.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051670122891664505, + "kl": 0.014468851499259472, + "learning_rate": 1.7733333333333334e-06, + "loss": 0.0007, + "num_tokens": 1386003.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0088161900639534, + "kl": 0.009596351999789476, + "learning_rate": 1.773e-06, + "loss": 0.0005, + "num_tokens": 1386275.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 86.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008560660295188427, + "kl": 0.00012165521184215322, + "learning_rate": 1.7726666666666668e-06, + "loss": 0.0, + "num_tokens": 1386555.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 86.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004640623927116394, + "kl": 0.0014469127054326236, + "learning_rate": 1.7723333333333331e-06, + "loss": 0.0001, + "num_tokens": 1386771.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 86.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06340131163597107, + "kl": 0.009800476138480008, + "learning_rate": 1.7720000000000001e-06, + "loss": 0.0005, + "num_tokens": 1387069.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 86.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.437071323394775, + "kl": 0.00455561134731397, + "learning_rate": 1.7716666666666665e-06, + "loss": 0.0831, + "num_tokens": 1387358.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 86.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03959748148918152, + "kl": 0.07488290406763554, + "learning_rate": 1.7713333333333333e-06, + "loss": 0.0038, + "num_tokens": 1387729.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 86.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10948716849088669, + "kl": 0.024760215543210506, + "learning_rate": 1.771e-06, + "loss": 0.0013, + "num_tokens": 1388026.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 86.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1482488363981247, + "kl": 0.020993283949792385, + "learning_rate": 1.7706666666666666e-06, + "loss": 0.001, + "num_tokens": 1388303.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 86.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009246274130418897, + "kl": 0.0037707313895225525, + "learning_rate": 1.7703333333333334e-06, + "loss": 0.0002, + "num_tokens": 1388539.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 86.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03788057342171669, + "kl": 0.041045090183615685, + "learning_rate": 1.77e-06, + "loss": 0.002, + "num_tokens": 1388952.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 86.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007056123577058315, + "kl": 0.0044207972241565585, + "learning_rate": 1.7696666666666668e-06, + "loss": 0.0002, + "num_tokens": 1389210.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 86.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009064299054443836, + "kl": 0.0008456170617137104, + "learning_rate": 1.7693333333333333e-06, + "loss": 0.0, + "num_tokens": 1389470.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 86.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023429162800312042, + "kl": 0.00045727938413619995, + "learning_rate": 1.7690000000000001e-06, + "loss": 0.0, + "num_tokens": 1389680.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 86.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21100997924804688, + "kl": 0.05531185492873192, + "learning_rate": 1.768666666666667e-06, + "loss": 0.0028, + "num_tokens": 1390021.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 86.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025573842227458954, + "kl": 0.0002676844596862793, + "learning_rate": 1.7683333333333333e-06, + "loss": 0.0, + "num_tokens": 1390234.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 86.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04619613289833069, + "kl": 0.011511936318129301, + "learning_rate": 1.7680000000000003e-06, + "loss": 0.0005, + "num_tokens": 1390566.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 87.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00197571306489408, + "kl": 0.26865261793136597, + "learning_rate": 1.7676666666666666e-06, + "loss": 0.0134, + "num_tokens": 1390870.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 87.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18719759583473206, + "kl": 0.041125768795609474, + "learning_rate": 1.7673333333333334e-06, + "loss": 0.0021, + "num_tokens": 1391196.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 87.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07855581492185593, + "kl": 0.01782261545304209, + "learning_rate": 1.767e-06, + "loss": 0.0009, + "num_tokens": 1391527.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 87.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06112566962838173, + "kl": 0.0025870645185932517, + "learning_rate": 1.7666666666666668e-06, + "loss": 0.0001, + "num_tokens": 1391825.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 87.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14201121032238007, + "kl": 0.055685851722955704, + "learning_rate": 1.7663333333333333e-06, + "loss": 0.0027, + "num_tokens": 1392242.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024855894967913628, + "kl": 0.010527586564421654, + "learning_rate": 1.7660000000000001e-06, + "loss": 0.0005, + "num_tokens": 1392514.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.536376953125, + "kl": 0.004537135362625122, + "learning_rate": 1.765666666666667e-06, + "loss": -0.0005, + "num_tokens": 1392774.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 87.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020892838947474957, + "kl": 0.2686363011598587, + "learning_rate": 1.7653333333333333e-06, + "loss": 0.0134, + "num_tokens": 1393078.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 87.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15940794348716736, + "kl": 0.03647000528872013, + "learning_rate": 1.7650000000000003e-06, + "loss": 0.0019, + "num_tokens": 1393420.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 87.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24599826335906982, + "kl": 0.014139835315290838, + "learning_rate": 1.7646666666666666e-06, + "loss": 0.0007, + "num_tokens": 1393663.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009678980335593224, + "kl": 0.0008582860173191875, + "learning_rate": 1.7643333333333334e-06, + "loss": 0.0, + "num_tokens": 1393923.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 87.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0481424406170845, + "kl": 0.0011678288865368813, + "learning_rate": 1.764e-06, + "loss": 0.0, + "num_tokens": 1394143.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 87.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0892491266131401, + "kl": 0.017186392098665237, + "learning_rate": 1.7636666666666667e-06, + "loss": 0.0009, + "num_tokens": 1394465.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 87.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09247300773859024, + "kl": 0.02190759778022766, + "learning_rate": 1.7633333333333333e-06, + "loss": 0.0012, + "num_tokens": 1394743.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 87.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0053375172428786755, + "kl": 0.0009459902939852327, + "learning_rate": 1.763e-06, + "loss": 0.0, + "num_tokens": 1395055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 87.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13784445822238922, + "kl": 0.025329535827040672, + "learning_rate": 1.7626666666666669e-06, + "loss": 0.0013, + "num_tokens": 1395358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 87.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031100338324904442, + "kl": 0.004801976203452796, + "learning_rate": 1.7623333333333335e-06, + "loss": 0.0002, + "num_tokens": 1395616.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 87.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11222835630178452, + "kl": 0.004167563864029944, + "learning_rate": 1.7620000000000002e-06, + "loss": 0.0002, + "num_tokens": 1395849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 87.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009320286917500198, + "kl": 0.003768324851989746, + "learning_rate": 1.7616666666666666e-06, + "loss": 0.0002, + "num_tokens": 1396085.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 87.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.992314100265503, + "kl": 0.07456966117024422, + "learning_rate": 1.7613333333333334e-06, + "loss": 0.0238, + "num_tokens": 1396448.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 87.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014076169580221176, + "kl": 0.0014486652798950672, + "learning_rate": 1.761e-06, + "loss": 0.0001, + "num_tokens": 1396776.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 87.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01541076973080635, + "kl": 0.0023488476872444153, + "learning_rate": 1.7606666666666667e-06, + "loss": 0.0001, + "num_tokens": 1397088.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07820127159357071, + "kl": 0.0077334127854555845, + "learning_rate": 1.7603333333333333e-06, + "loss": 0.0004, + "num_tokens": 1397388.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03230593726038933, + "kl": 0.001247149455593899, + "learning_rate": 1.76e-06, + "loss": 0.0001, + "num_tokens": 1397645.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 87.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07705104351043701, + "kl": 0.0156064429320395, + "learning_rate": 1.7596666666666669e-06, + "loss": 0.0008, + "num_tokens": 1397923.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 87.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8879602551460266, + "kl": 0.1230792049318552, + "learning_rate": 1.7593333333333334e-06, + "loss": 0.0061, + "num_tokens": 1398221.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 73.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 87.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8353389501571655, + "kl": 0.09082326479256153, + "learning_rate": 1.7590000000000002e-06, + "loss": 0.332, + "num_tokens": 1398736.0, + "reward": 5.550000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 5.550000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 4724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 87.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14337654411792755, + "kl": 0.02798518445342779, + "learning_rate": 1.7586666666666666e-06, + "loss": 0.0014, + "num_tokens": 1399118.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 87.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03855842724442482, + "kl": 0.0004110857844352722, + "learning_rate": 1.7583333333333334e-06, + "loss": 0.0, + "num_tokens": 1399330.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 87.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006476222071796656, + "kl": 0.0005181431770324707, + "learning_rate": 1.758e-06, + "loss": 0.0, + "num_tokens": 1399590.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 87.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060850590467453, + "kl": 0.0025919199688360095, + "learning_rate": 1.7576666666666667e-06, + "loss": 0.0001, + "num_tokens": 1399918.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34920141100883484, + "kl": 0.03598490194417536, + "learning_rate": 1.7573333333333333e-06, + "loss": 0.002, + "num_tokens": 1400213.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 87.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005956654902547598, + "kl": 0.015529973432421684, + "learning_rate": 1.757e-06, + "loss": 0.0008, + "num_tokens": 1400473.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 87.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025009100791066885, + "kl": 0.003056600457057357, + "learning_rate": 1.7566666666666669e-06, + "loss": 0.0001, + "num_tokens": 1400741.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 87.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42736586928367615, + "kl": 0.08293714374303818, + "learning_rate": 1.7563333333333334e-06, + "loss": 0.0037, + "num_tokens": 1401071.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 87.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0065531861037015915, + "kl": 0.0003086984215769917, + "learning_rate": 1.7560000000000002e-06, + "loss": 0.0, + "num_tokens": 1401291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 87.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049356941133737564, + "kl": 0.16276530921459198, + "learning_rate": 1.7556666666666666e-06, + "loss": 0.0081, + "num_tokens": 1401601.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 87.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47890207171440125, + "kl": 0.018510058522224426, + "learning_rate": 1.7553333333333334e-06, + "loss": 0.0009, + "num_tokens": 1401817.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07846085727214813, + "kl": 0.02969266881700605, + "learning_rate": 1.755e-06, + "loss": 0.0015, + "num_tokens": 1402105.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 87.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01720520481467247, + "kl": 0.06209162622690201, + "learning_rate": 1.7546666666666667e-06, + "loss": 0.0031, + "num_tokens": 1402437.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 87.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033414989709854126, + "kl": 0.0066407660488039255, + "learning_rate": 1.7543333333333333e-06, + "loss": 0.0003, + "num_tokens": 1402769.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027305880561470985, + "kl": 0.0009377330643474124, + "learning_rate": 1.754e-06, + "loss": 0.0, + "num_tokens": 1403039.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.2385735511779785, + "kl": 0.030476846266537905, + "learning_rate": 1.7536666666666668e-06, + "loss": 0.209, + "num_tokens": 1403341.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 87.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8092451095581055, + "kl": 0.2841898649930954, + "learning_rate": 1.7533333333333334e-06, + "loss": -0.119, + "num_tokens": 1403697.0, + "reward": 6.625, + "reward_std": 2.428133726119995, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.428133726119995, + "step": 4741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 87.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.387889862060547, + "kl": 0.1311328737065196, + "learning_rate": 1.7530000000000002e-06, + "loss": 0.0091, + "num_tokens": 1403971.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 87.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023371785879135132, + "kl": 0.005181870190426707, + "learning_rate": 1.7526666666666666e-06, + "loss": 0.0003, + "num_tokens": 1404259.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 87.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0339960977435112, + "kl": 0.005621553864330053, + "learning_rate": 1.7523333333333336e-06, + "loss": 0.0003, + "num_tokens": 1404550.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 87.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002939185651484877, + "kl": 8.12336802482605e-05, + "learning_rate": 1.752e-06, + "loss": 0.0, + "num_tokens": 1404770.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 87.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010580105008557439, + "kl": 4.505366086959839e-05, + "learning_rate": 1.7516666666666667e-06, + "loss": 0.0, + "num_tokens": 1404982.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2652187943458557, + "kl": 0.0241744231316261, + "learning_rate": 1.7513333333333333e-06, + "loss": 0.0011, + "num_tokens": 1405244.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 87.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6330318450927734, + "kl": 0.8132757358253002, + "learning_rate": 1.751e-06, + "loss": 0.0205, + "num_tokens": 1405612.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 87.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1833159178495407, + "kl": 0.03436026722192764, + "learning_rate": 1.7506666666666668e-06, + "loss": 0.0017, + "num_tokens": 1405884.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 87.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12061919271945953, + "kl": 0.0028928687097504735, + "learning_rate": 1.7503333333333334e-06, + "loss": 0.0001, + "num_tokens": 1406156.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 87.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03356965258717537, + "kl": 0.0042511168867349625, + "learning_rate": 1.7500000000000002e-06, + "loss": 0.0002, + "num_tokens": 1406433.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 88.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9887139797210693, + "kl": 0.019847046583890915, + "learning_rate": 1.7496666666666665e-06, + "loss": -0.0394, + "num_tokens": 1406739.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4752 + }, + { + "clip_ratio/high_max": 0.01515151560306549, + "clip_ratio/high_mean": 0.01515151560306549, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01515151560306549, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 88.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.949356079101562, + "kl": 0.024055887013673782, + "learning_rate": 1.7493333333333335e-06, + "loss": 0.1458, + "num_tokens": 1407020.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 88.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031621236354112625, + "kl": 0.0035676882253028452, + "learning_rate": 1.749e-06, + "loss": 0.0002, + "num_tokens": 1407315.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012054701335728168, + "kl": 0.0038182729622349143, + "learning_rate": 1.7486666666666667e-06, + "loss": 0.0002, + "num_tokens": 1407605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 88.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02661200799047947, + "kl": 0.0019001525943167508, + "learning_rate": 1.7483333333333333e-06, + "loss": 0.0001, + "num_tokens": 1407931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 88.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040140535682439804, + "kl": 0.002985157072544098, + "learning_rate": 1.748e-06, + "loss": 0.0001, + "num_tokens": 1408213.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 88.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.893159866333008, + "kl": 0.17455535382032394, + "learning_rate": 1.7476666666666668e-06, + "loss": 0.0607, + "num_tokens": 1408565.0, + "reward": 4.25, + "reward_std": 4.27200174331665, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 4.27200174331665, + "step": 4758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 88.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9771177768707275, + "kl": 0.10692603280767798, + "learning_rate": 1.7473333333333334e-06, + "loss": -0.0245, + "num_tokens": 1408849.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 88.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2160401344299316, + "kl": 0.20149564469465986, + "learning_rate": 1.7470000000000002e-06, + "loss": 0.0125, + "num_tokens": 1409124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 88.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10250312089920044, + "kl": 0.00881947239395231, + "learning_rate": 1.7466666666666665e-06, + "loss": 0.0004, + "num_tokens": 1409384.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 88.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09445659071207047, + "kl": 0.04594658687710762, + "learning_rate": 1.7463333333333335e-06, + "loss": 0.0023, + "num_tokens": 1409684.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 88.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015238243155181408, + "kl": 0.0001875132293207571, + "learning_rate": 1.7459999999999999e-06, + "loss": 0.0, + "num_tokens": 1409940.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 88.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07152358442544937, + "kl": 0.024334699905011803, + "learning_rate": 1.7456666666666667e-06, + "loss": 0.0012, + "num_tokens": 1410228.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 88.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09340780973434448, + "kl": 0.03230349626392126, + "learning_rate": 1.7453333333333332e-06, + "loss": 0.0013, + "num_tokens": 1410613.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 88.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12714384496212006, + "kl": 0.11031162738800049, + "learning_rate": 1.745e-06, + "loss": 0.0055, + "num_tokens": 1410985.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 88.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07792849838733673, + "kl": 0.011701170820742846, + "learning_rate": 1.7446666666666668e-06, + "loss": 0.0006, + "num_tokens": 1411315.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 88.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06055062264204025, + "kl": 0.0009274661424569786, + "learning_rate": 1.7443333333333334e-06, + "loss": 0.0, + "num_tokens": 1411585.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 88.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.668433904647827, + "kl": 0.05290278419852257, + "learning_rate": 1.7440000000000002e-06, + "loss": -0.0847, + "num_tokens": 1411940.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 88.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009742482216097414, + "kl": 0.003757782280445099, + "learning_rate": 1.7436666666666667e-06, + "loss": 0.0002, + "num_tokens": 1412176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007873695343732834, + "kl": 0.0016839823802001774, + "learning_rate": 1.7433333333333335e-06, + "loss": 0.0001, + "num_tokens": 1412456.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 88.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004643014166504145, + "kl": 0.00017097840463975444, + "learning_rate": 1.7429999999999999e-06, + "loss": 0.0, + "num_tokens": 1412728.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 88.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1629791259765625, + "kl": 0.22995993122458458, + "learning_rate": 1.7426666666666667e-06, + "loss": -0.0238, + "num_tokens": 1413128.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 4773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0317719392478466, + "kl": 0.001265214232262224, + "learning_rate": 1.7423333333333332e-06, + "loss": 0.0001, + "num_tokens": 1413400.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 88.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008855005726218224, + "kl": 0.009439130313694477, + "learning_rate": 1.742e-06, + "loss": 0.0005, + "num_tokens": 1413672.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 88.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03459019213914871, + "kl": 0.0020393177692312747, + "learning_rate": 1.7416666666666668e-06, + "loss": 0.0001, + "num_tokens": 1413891.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4776 + }, + { + "clip_ratio/high_max": 0.014705882407724857, + "clip_ratio/high_mean": 0.014705882407724857, + "clip_ratio/low_mean": 0.014705882407724857, + "clip_ratio/low_min": 0.014705882407724857, + "clip_ratio/region_mean": 0.029411764815449715, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 88.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4239726066589355, + "kl": 0.06648285128176212, + "learning_rate": 1.7413333333333334e-06, + "loss": -0.1482, + "num_tokens": 1414189.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 88.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06055663526058197, + "kl": 0.07100075110793114, + "learning_rate": 1.7410000000000001e-06, + "loss": 0.0035, + "num_tokens": 1414574.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 88.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029829610139131546, + "kl": 7.943809032440186e-05, + "learning_rate": 1.7406666666666667e-06, + "loss": 0.0, + "num_tokens": 1414794.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006371496245265007, + "kl": 0.00045894537470303476, + "learning_rate": 1.7403333333333335e-06, + "loss": 0.0, + "num_tokens": 1415115.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 88.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11365672200918198, + "kl": 0.0212111659348011, + "learning_rate": 1.7399999999999999e-06, + "loss": 0.0011, + "num_tokens": 1415412.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 88.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005644958931952715, + "kl": 0.000581253319978714, + "learning_rate": 1.7396666666666666e-06, + "loss": 0.0, + "num_tokens": 1415672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 88.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04187702015042305, + "kl": 0.004522688686847687, + "learning_rate": 1.7393333333333332e-06, + "loss": 0.0002, + "num_tokens": 1416002.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 88.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009981738403439522, + "kl": 0.00011271610856056213, + "learning_rate": 1.739e-06, + "loss": 0.0, + "num_tokens": 1416246.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 88.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015374717302620411, + "kl": 0.0005145706236362457, + "learning_rate": 1.7386666666666668e-06, + "loss": 0.0, + "num_tokens": 1416508.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 88.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16972176730632782, + "kl": 0.018287737853825092, + "learning_rate": 1.7383333333333333e-06, + "loss": 0.0012, + "num_tokens": 1416731.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 88.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02892535738646984, + "kl": 0.000577746395720169, + "learning_rate": 1.7380000000000001e-06, + "loss": 0.0, + "num_tokens": 1416964.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 88.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023547876626253128, + "kl": 0.0006417706608772278, + "learning_rate": 1.7376666666666667e-06, + "loss": 0.0, + "num_tokens": 1417174.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07347511500120163, + "kl": 0.008679481688886881, + "learning_rate": 1.7373333333333335e-06, + "loss": 0.0004, + "num_tokens": 1417462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 88.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0788622796535492, + "kl": 0.004397721262648702, + "learning_rate": 1.7369999999999998e-06, + "loss": 0.0002, + "num_tokens": 1417758.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 88.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08594287186861038, + "kl": 0.014041016809642315, + "learning_rate": 1.7366666666666668e-06, + "loss": 0.0007, + "num_tokens": 1418087.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 88.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09737042337656021, + "kl": 0.015641923062503338, + "learning_rate": 1.7363333333333332e-06, + "loss": 0.0008, + "num_tokens": 1418395.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 88.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17543596029281616, + "kl": 0.036995720118284225, + "learning_rate": 1.736e-06, + "loss": 0.0019, + "num_tokens": 1418732.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 88.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032571146730333567, + "kl": 0.2684212028980255, + "learning_rate": 1.7356666666666668e-06, + "loss": 0.0134, + "num_tokens": 1419036.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 88.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03074190579354763, + "kl": 0.000304393470287323, + "learning_rate": 1.7353333333333333e-06, + "loss": 0.0, + "num_tokens": 1419248.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 88.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21857213973999, + "kl": 0.030106719117611647, + "learning_rate": 1.7350000000000001e-06, + "loss": -0.0121, + "num_tokens": 1419565.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 88.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06326611340045929, + "kl": 0.009333632420748472, + "learning_rate": 1.7346666666666667e-06, + "loss": 0.0005, + "num_tokens": 1419839.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 88.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14643755555152893, + "kl": 0.052632153034210205, + "learning_rate": 1.7343333333333335e-06, + "loss": 0.0026, + "num_tokens": 1420111.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 88.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.514625549316406, + "kl": 0.03136721812188625, + "learning_rate": 1.7339999999999998e-06, + "loss": 0.1059, + "num_tokens": 1420469.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 4799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 88.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026346635073423386, + "kl": 0.0018991194665431976, + "learning_rate": 1.7336666666666668e-06, + "loss": 0.0001, + "num_tokens": 1420781.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 88.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01011976134032011, + "kl": 0.16344992071390152, + "learning_rate": 1.7333333333333332e-06, + "loss": 0.0082, + "num_tokens": 1421089.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 88.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04138335585594177, + "kl": 0.003717821091413498, + "learning_rate": 1.733e-06, + "loss": 0.0002, + "num_tokens": 1421411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 88.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09094447642564774, + "kl": 0.021962124854326248, + "learning_rate": 1.7326666666666667e-06, + "loss": 0.0012, + "num_tokens": 1421688.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 88.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13578258454799652, + "kl": 0.019133458845317364, + "learning_rate": 1.7323333333333333e-06, + "loss": 0.001, + "num_tokens": 1421951.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 88.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04140186682343483, + "kl": 0.02233363315463066, + "learning_rate": 1.732e-06, + "loss": 0.0011, + "num_tokens": 1422327.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 89.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07996660470962524, + "kl": 0.001408204436302185, + "learning_rate": 1.7316666666666667e-06, + "loss": 0.0001, + "num_tokens": 1422547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 89.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.286879062652588, + "kl": 0.015015662327641621, + "learning_rate": 1.7313333333333335e-06, + "loss": 0.0889, + "num_tokens": 1422824.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 4807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 89.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022143719252198935, + "kl": 0.0004230029881000519, + "learning_rate": 1.7309999999999998e-06, + "loss": 0.0, + "num_tokens": 1423084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 89.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12269677966833115, + "kl": 0.03959126025438309, + "learning_rate": 1.7306666666666668e-06, + "loss": 0.002, + "num_tokens": 1423467.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 89.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016634922474622726, + "kl": 0.023318459207075648, + "learning_rate": 1.7303333333333332e-06, + "loss": 0.0012, + "num_tokens": 1423756.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 89.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02401517890393734, + "kl": 0.0019014648860320449, + "learning_rate": 1.73e-06, + "loss": 0.0001, + "num_tokens": 1424036.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 89.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06616130471229553, + "kl": 0.015449159778654575, + "learning_rate": 1.7296666666666667e-06, + "loss": 0.0008, + "num_tokens": 1424340.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 89.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06612984836101532, + "kl": 0.03784245811402798, + "learning_rate": 1.7293333333333333e-06, + "loss": 0.0019, + "num_tokens": 1424642.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 89.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013312330469489098, + "kl": 0.018647802527993917, + "learning_rate": 1.729e-06, + "loss": 0.0009, + "num_tokens": 1424920.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 89.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3609471321105957, + "kl": 0.26293565332889557, + "learning_rate": 1.7286666666666667e-06, + "loss": 0.055, + "num_tokens": 1425270.0, + "reward": 4.75, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 4.75, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 4815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 89.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033325408585369587, + "kl": 0.26840245723724365, + "learning_rate": 1.7283333333333334e-06, + "loss": 0.0134, + "num_tokens": 1425574.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 89.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011171288788318634, + "kl": 0.00010593980550765991, + "learning_rate": 1.728e-06, + "loss": 0.0, + "num_tokens": 1425818.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 89.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012760492041707039, + "kl": 0.05843185447156429, + "learning_rate": 1.7276666666666668e-06, + "loss": 0.0029, + "num_tokens": 1426150.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 89.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010549293830990791, + "kl": 0.014556619804352522, + "learning_rate": 1.7273333333333336e-06, + "loss": 0.0007, + "num_tokens": 1426410.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 89.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006148915272206068, + "kl": 0.00037801267171744257, + "learning_rate": 1.727e-06, + "loss": 0.0, + "num_tokens": 1426670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 89.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050276126712560654, + "kl": 0.0018282572855241597, + "learning_rate": 1.726666666666667e-06, + "loss": 0.0001, + "num_tokens": 1426937.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 89.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0307657178491354, + "kl": 0.00029872357845306396, + "learning_rate": 1.7263333333333333e-06, + "loss": 0.0, + "num_tokens": 1427149.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 89.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00695132277905941, + "kl": 0.00042761834629345685, + "learning_rate": 1.726e-06, + "loss": 0.0, + "num_tokens": 1427467.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 89.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029064415022730827, + "kl": 0.003741058288142085, + "learning_rate": 1.7256666666666666e-06, + "loss": 0.0002, + "num_tokens": 1427796.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 89.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9398007392883301, + "kl": 0.09074857085943222, + "learning_rate": 1.7253333333333334e-06, + "loss": 0.0045, + "num_tokens": 1428016.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 89.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0314173623919487, + "kl": 0.002311806194484234, + "learning_rate": 1.725e-06, + "loss": 0.0001, + "num_tokens": 1428300.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 89.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03163344785571098, + "kl": 0.0005777254700660706, + "learning_rate": 1.7246666666666668e-06, + "loss": 0.0, + "num_tokens": 1428506.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 89.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005189788527786732, + "kl": 0.0004135353665333241, + "learning_rate": 1.7243333333333336e-06, + "loss": 0.0, + "num_tokens": 1428818.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 89.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7829396724700928, + "kl": 0.062060149386525154, + "learning_rate": 1.724e-06, + "loss": 0.005, + "num_tokens": 1429106.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 89.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7605483531951904, + "kl": 0.107602808624506, + "learning_rate": 1.723666666666667e-06, + "loss": 0.0065, + "num_tokens": 1429477.0, + "reward": 5.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 2.75, + "step": 4830 + }, + { + "clip_ratio/high_max": 0.01315789483487606, + "clip_ratio/high_mean": 0.01315789483487606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01315789483487606, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 89.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.245172500610352, + "kl": 0.05810554325580597, + "learning_rate": 1.7233333333333333e-06, + "loss": 0.137, + "num_tokens": 1429762.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 89.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01007368229329586, + "kl": 0.1634574607014656, + "learning_rate": 1.723e-06, + "loss": 0.0082, + "num_tokens": 1430070.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 89.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05980310216546059, + "kl": 0.0070490543730556965, + "learning_rate": 1.7226666666666666e-06, + "loss": 0.0004, + "num_tokens": 1430364.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 89.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006359180435538292, + "kl": 0.00012331008838373236, + "learning_rate": 1.7223333333333334e-06, + "loss": 0.0, + "num_tokens": 1430620.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 89.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026532836258411407, + "kl": 0.0018375739455223083, + "learning_rate": 1.722e-06, + "loss": 0.0001, + "num_tokens": 1430932.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 89.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027995651587843895, + "kl": 0.004782599047757685, + "learning_rate": 1.7216666666666668e-06, + "loss": 0.0002, + "num_tokens": 1431227.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 89.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051071785390377045, + "kl": 0.008735061157494783, + "learning_rate": 1.7213333333333336e-06, + "loss": 0.0004, + "num_tokens": 1431500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 89.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.852427959442139, + "kl": 0.016891013365238905, + "learning_rate": 1.721e-06, + "loss": 0.1198, + "num_tokens": 1431783.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 89.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028023596853017807, + "kl": 0.007031596032902598, + "learning_rate": 1.720666666666667e-06, + "loss": 0.0004, + "num_tokens": 1432076.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 89.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31240132451057434, + "kl": 0.01999014150351286, + "learning_rate": 1.7203333333333333e-06, + "loss": 0.001, + "num_tokens": 1432374.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 89.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09493028372526169, + "kl": 0.06407080590724945, + "learning_rate": 1.72e-06, + "loss": 0.0033, + "num_tokens": 1432793.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 89.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007937569171190262, + "kl": 0.0008932113996706903, + "learning_rate": 1.7196666666666666e-06, + "loss": 0.0, + "num_tokens": 1433013.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 89.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010140584781765938, + "kl": 0.00374448299407959, + "learning_rate": 1.7193333333333334e-06, + "loss": 0.0002, + "num_tokens": 1433249.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 89.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.036374092102051, + "kl": 0.08073539473116398, + "learning_rate": 1.719e-06, + "loss": 0.1016, + "num_tokens": 1433508.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 89.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02493441291153431, + "kl": 0.0010606646537780762, + "learning_rate": 1.7186666666666668e-06, + "loss": 0.0001, + "num_tokens": 1433720.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 89.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17476995289325714, + "kl": 0.018154183868318796, + "learning_rate": 1.7183333333333335e-06, + "loss": 0.0009, + "num_tokens": 1434010.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 89.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09208376705646515, + "kl": 0.006146706640720367, + "learning_rate": 1.7180000000000001e-06, + "loss": 0.0003, + "num_tokens": 1434226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 89.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10774128139019012, + "kl": 0.019184167496860027, + "learning_rate": 1.7176666666666669e-06, + "loss": 0.001, + "num_tokens": 1434559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 89.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01987144537270069, + "kl": 0.0017949468601727858, + "learning_rate": 1.7173333333333333e-06, + "loss": 0.0001, + "num_tokens": 1434829.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011904762126505375, + "clip_ratio/low_min": 0.011904762126505375, + "clip_ratio/region_mean": 0.011904762126505375, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 89.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.479868412017822, + "kl": 0.2453064126893878, + "learning_rate": 1.717e-06, + "loss": 0.2103, + "num_tokens": 1435117.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 4850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 89.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10015010088682175, + "kl": 0.003540456644259393, + "learning_rate": 1.7166666666666666e-06, + "loss": 0.0002, + "num_tokens": 1435351.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 89.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6972345113754272, + "kl": 0.04174995399080217, + "learning_rate": 1.7163333333333334e-06, + "loss": 0.0021, + "num_tokens": 1435680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 89.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07457930594682693, + "kl": 0.01834350824356079, + "learning_rate": 1.716e-06, + "loss": 0.0009, + "num_tokens": 1436002.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 89.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17040890455245972, + "kl": 0.020668487064540386, + "learning_rate": 1.7156666666666667e-06, + "loss": 0.0011, + "num_tokens": 1436346.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 89.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0210583209991455, + "kl": 0.10992814600467682, + "learning_rate": 1.7153333333333335e-06, + "loss": 0.0474, + "num_tokens": 1436679.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 89.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18218640983104706, + "kl": 0.011712106177583337, + "learning_rate": 1.715e-06, + "loss": 0.0006, + "num_tokens": 1436983.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 89.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.696809768676758, + "kl": 0.05960363708436489, + "learning_rate": 1.7146666666666669e-06, + "loss": -0.02, + "num_tokens": 1437264.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 89.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.515857994556427, + "kl": 0.0425142552703619, + "learning_rate": 1.7143333333333332e-06, + "loss": 0.0022, + "num_tokens": 1437584.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 89.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10030697286128998, + "kl": 0.026347876526415348, + "learning_rate": 1.714e-06, + "loss": 0.0015, + "num_tokens": 1437888.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 90.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.415035367012024, + "kl": 0.12497110664844513, + "learning_rate": 1.7136666666666666e-06, + "loss": -0.0147, + "num_tokens": 1438258.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 90.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38684168457984924, + "kl": 0.026495108380913734, + "learning_rate": 1.7133333333333334e-06, + "loss": 0.0013, + "num_tokens": 1438529.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 90.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00693944375962019, + "kl": 0.001373795501422137, + "learning_rate": 1.713e-06, + "loss": 0.0001, + "num_tokens": 1438806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 90.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054273296147584915, + "kl": 0.010294564999639988, + "learning_rate": 1.7126666666666667e-06, + "loss": 0.0005, + "num_tokens": 1439126.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 90.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05258876457810402, + "kl": 0.01519824587740004, + "learning_rate": 1.7123333333333335e-06, + "loss": 0.0007, + "num_tokens": 1439448.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.014285714365541935, + "clip_ratio/low_min": 0.014285714365541935, + "clip_ratio/region_mean": 0.014285714365541935, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 90.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7553659677505493, + "kl": 0.07879587262868881, + "learning_rate": 1.712e-06, + "loss": 0.0109, + "num_tokens": 1439749.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 90.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9674148559570312, + "kl": 0.11636403948068619, + "learning_rate": 1.7116666666666669e-06, + "loss": 0.0057, + "num_tokens": 1440121.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 4866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 90.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10132623463869095, + "kl": 0.019033951684832573, + "learning_rate": 1.7113333333333332e-06, + "loss": 0.001, + "num_tokens": 1440464.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 90.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03728455305099487, + "kl": 0.0023740706965327263, + "learning_rate": 1.7110000000000002e-06, + "loss": 0.0001, + "num_tokens": 1440699.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 90.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025413207709789276, + "kl": 0.004943513544276357, + "learning_rate": 1.7106666666666666e-06, + "loss": 0.0002, + "num_tokens": 1440988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 90.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 19.713300704956055, + "kl": 0.07313615083694458, + "learning_rate": 1.7103333333333334e-06, + "loss": 0.1707, + "num_tokens": 1441202.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 90.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07446187734603882, + "kl": 0.013546426314860582, + "learning_rate": 1.71e-06, + "loss": 0.0007, + "num_tokens": 1441536.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 90.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008628357201814651, + "kl": 0.0004967711865901947, + "learning_rate": 1.7096666666666667e-06, + "loss": 0.0, + "num_tokens": 1441796.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 90.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7763540744781494, + "kl": 0.021116905263625085, + "learning_rate": 1.7093333333333335e-06, + "loss": 0.0005, + "num_tokens": 1442122.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 90.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00874437764286995, + "kl": 0.014907174278050661, + "learning_rate": 1.709e-06, + "loss": 0.0007, + "num_tokens": 1442382.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 90.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2355698198080063, + "kl": 0.02511242777109146, + "learning_rate": 1.7086666666666669e-06, + "loss": 0.0013, + "num_tokens": 1442642.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 90.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.532217979431152, + "kl": 0.02727799816057086, + "learning_rate": 1.7083333333333332e-06, + "loss": 0.1836, + "num_tokens": 1442926.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 90.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022559387609362602, + "kl": 0.04269120469689369, + "learning_rate": 1.7080000000000002e-06, + "loss": 0.0021, + "num_tokens": 1443330.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 90.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.765388011932373, + "kl": 0.0718580037355423, + "learning_rate": 1.7076666666666666e-06, + "loss": 0.1521, + "num_tokens": 1443656.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 90.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04293379187583923, + "kl": 0.027146708220243454, + "learning_rate": 1.7073333333333333e-06, + "loss": 0.0014, + "num_tokens": 1444045.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 90.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010079680010676384, + "kl": 0.00021346905850805342, + "learning_rate": 1.707e-06, + "loss": 0.0, + "num_tokens": 1444315.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 90.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05477239191532135, + "kl": 0.00454887276282534, + "learning_rate": 1.7066666666666667e-06, + "loss": 0.0002, + "num_tokens": 1444583.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 90.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01082111056894064, + "kl": 0.16325707733631134, + "learning_rate": 1.7063333333333335e-06, + "loss": 0.0082, + "num_tokens": 1444891.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 90.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002076560485875234, + "kl": 5.111098289489746e-06, + "learning_rate": 1.706e-06, + "loss": 0.0, + "num_tokens": 1445111.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4883 + }, + { + "clip_ratio/high_max": 0.006097560748457909, + "clip_ratio/high_mean": 0.006097560748457909, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006097560748457909, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 90.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0956783294677734, + "kl": 0.05870269238948822, + "learning_rate": 1.7056666666666668e-06, + "loss": 0.0503, + "num_tokens": 1445488.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 90.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1301628053188324, + "kl": 0.015491341473534703, + "learning_rate": 1.7053333333333332e-06, + "loss": 0.0008, + "num_tokens": 1445779.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 90.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03917824104428291, + "kl": 0.0007429122815665323, + "learning_rate": 1.7050000000000002e-06, + "loss": 0.0, + "num_tokens": 1446035.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 90.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017919491976499557, + "kl": 0.004026137758046389, + "learning_rate": 1.7046666666666666e-06, + "loss": 0.0002, + "num_tokens": 1446303.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 90.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009305633720941842, + "kl": 0.0037599578499794006, + "learning_rate": 1.7043333333333333e-06, + "loss": 0.0002, + "num_tokens": 1446539.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 90.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013986870646476746, + "kl": 0.0006524303462356329, + "learning_rate": 1.704e-06, + "loss": 0.0, + "num_tokens": 1446865.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 90.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02480613999068737, + "kl": 0.0013298190315254033, + "learning_rate": 1.7036666666666667e-06, + "loss": 0.0001, + "num_tokens": 1447143.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 90.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01889585331082344, + "kl": 0.004025566508062184, + "learning_rate": 1.7033333333333335e-06, + "loss": 0.0002, + "num_tokens": 1447435.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 90.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06282459199428558, + "kl": 0.0034190231235697865, + "learning_rate": 1.703e-06, + "loss": 0.0002, + "num_tokens": 1447735.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 90.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1463141441345215, + "kl": 0.08878976106643677, + "learning_rate": 1.7026666666666668e-06, + "loss": 0.2138, + "num_tokens": 1448056.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 4893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 90.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007869921624660492, + "kl": 0.0022259624674916267, + "learning_rate": 1.7023333333333334e-06, + "loss": 0.0001, + "num_tokens": 1448368.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 90.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.923832654953003, + "kl": 0.06465357914566994, + "learning_rate": 1.7020000000000002e-06, + "loss": 0.115, + "num_tokens": 1448727.0, + "reward": 4.125, + "reward_std": 3.902456521987915, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 3.902456521987915, + "step": 4895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 90.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015185699798166752, + "kl": 0.0004437565803527832, + "learning_rate": 1.7016666666666665e-06, + "loss": 0.0, + "num_tokens": 1448935.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 90.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004129840526729822, + "kl": 0.0015180340269580483, + "learning_rate": 1.7013333333333333e-06, + "loss": 0.0001, + "num_tokens": 1449219.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 90.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.470602989196777, + "kl": 0.009461591951549053, + "learning_rate": 1.7009999999999999e-06, + "loss": 0.2991, + "num_tokens": 1449451.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 4898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 90.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09812650829553604, + "kl": 0.030571318231523037, + "learning_rate": 1.7006666666666667e-06, + "loss": 0.0015, + "num_tokens": 1449726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 90.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009352663531899452, + "kl": 0.009455515537410975, + "learning_rate": 1.7003333333333335e-06, + "loss": 0.0005, + "num_tokens": 1449998.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 90.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040579233318567276, + "kl": 0.007567316293716431, + "learning_rate": 1.7e-06, + "loss": 0.0004, + "num_tokens": 1450302.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 90.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.976142406463623, + "kl": 0.12730075418949127, + "learning_rate": 1.6996666666666668e-06, + "loss": -0.0796, + "num_tokens": 1450658.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 4902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 90.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10649600625038147, + "kl": 0.014784622006118298, + "learning_rate": 1.6993333333333334e-06, + "loss": 0.0007, + "num_tokens": 1450942.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 90.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5189959406852722, + "kl": 0.05209812615066767, + "learning_rate": 1.6990000000000002e-06, + "loss": 0.0033, + "num_tokens": 1451216.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 90.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41963323950767517, + "kl": 0.06770718470215797, + "learning_rate": 1.6986666666666665e-06, + "loss": 0.0036, + "num_tokens": 1451504.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 90.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021894490346312523, + "kl": 0.0008557852415833622, + "learning_rate": 1.6983333333333333e-06, + "loss": 0.0, + "num_tokens": 1451815.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 90.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849485993385315, + "kl": 0.0034385338658466935, + "learning_rate": 1.6979999999999999e-06, + "loss": 0.0002, + "num_tokens": 1452034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 90.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016248758882284164, + "kl": 0.2645472288131714, + "learning_rate": 1.6976666666666667e-06, + "loss": 0.0132, + "num_tokens": 1452339.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 90.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04291994124650955, + "kl": 0.023891227785497904, + "learning_rate": 1.6973333333333334e-06, + "loss": 0.0012, + "num_tokens": 1452628.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 90.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006555619183927774, + "kl": 0.0007132887840270996, + "learning_rate": 1.697e-06, + "loss": 0.0, + "num_tokens": 1452888.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 90.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11527550965547562, + "kl": 0.037596405716612935, + "learning_rate": 1.6966666666666668e-06, + "loss": 0.0019, + "num_tokens": 1453217.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 90.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022737320512533188, + "kl": 0.05129823461174965, + "learning_rate": 1.6963333333333334e-06, + "loss": 0.0026, + "num_tokens": 1453553.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 90.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014274314744397998, + "kl": 9.586289525032043e-05, + "learning_rate": 1.6960000000000002e-06, + "loss": 0.0, + "num_tokens": 1453797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02939451113343239, + "kl": 0.00025314688673461205, + "learning_rate": 1.6956666666666665e-06, + "loss": 0.0, + "num_tokens": 1454010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 91.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010532417334616184, + "kl": 0.003958088462240994, + "learning_rate": 1.6953333333333335e-06, + "loss": 0.0002, + "num_tokens": 1454278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007757003186270595, + "kl": 0.0037833750247955322, + "learning_rate": 1.6949999999999999e-06, + "loss": 0.0002, + "num_tokens": 1454514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 91.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07356951385736465, + "kl": 0.022042195312678814, + "learning_rate": 1.6946666666666666e-06, + "loss": 0.001, + "num_tokens": 1454858.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 91.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033774372190237045, + "kl": 0.002994000678882003, + "learning_rate": 1.6943333333333334e-06, + "loss": 0.0001, + "num_tokens": 1455128.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10086127370595932, + "kl": 0.0036579921725206077, + "learning_rate": 1.694e-06, + "loss": 0.0002, + "num_tokens": 1455346.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 91.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7861106395721436, + "kl": 0.0038848338299430907, + "learning_rate": 1.6936666666666668e-06, + "loss": 0.4534, + "num_tokens": 1455873.0, + "reward": 5.300000190734863, + "reward_std": 4.400000095367432, + "rewards/reward_combined/mean": 5.300000190734863, + "rewards/reward_combined/std": 4.400000095367432, + "step": 4920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 91.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3681552410125732, + "kl": 0.335488423705101, + "learning_rate": 1.6933333333333334e-06, + "loss": 0.1304, + "num_tokens": 1456241.0, + "reward": 4.75, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 4.75, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 4921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 91.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04183509573340416, + "kl": 0.008128313114866614, + "learning_rate": 1.6930000000000001e-06, + "loss": 0.0004, + "num_tokens": 1456566.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 91.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6732306480407715, + "kl": 0.1125209890305996, + "learning_rate": 1.6926666666666665e-06, + "loss": 0.0594, + "num_tokens": 1456910.0, + "reward": 4.625, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 4.308422088623047, + "step": 4923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006506770849227905, + "kl": 0.0006705879932269454, + "learning_rate": 1.6923333333333335e-06, + "loss": 0.0, + "num_tokens": 1457170.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 91.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02825717069208622, + "kl": 0.001499679303378798, + "learning_rate": 1.6919999999999999e-06, + "loss": 0.0001, + "num_tokens": 1457481.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 91.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009554890915751457, + "kl": 0.00931511353701353, + "learning_rate": 1.6916666666666666e-06, + "loss": 0.0005, + "num_tokens": 1457753.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010792806278914213, + "kl": 3.1888484954833984e-06, + "learning_rate": 1.6913333333333334e-06, + "loss": 0.0, + "num_tokens": 1457973.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 91.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031192228198051453, + "kl": 0.004844009876251221, + "learning_rate": 1.691e-06, + "loss": 0.0002, + "num_tokens": 1458185.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 91.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0283152237534523, + "kl": 0.0005697508458979428, + "learning_rate": 1.6906666666666668e-06, + "loss": 0.0, + "num_tokens": 1458397.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 91.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047497108578681946, + "kl": 0.0030733130406588316, + "learning_rate": 1.6903333333333333e-06, + "loss": 0.0002, + "num_tokens": 1458692.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 91.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022882595658302307, + "kl": 0.0008376652549486607, + "learning_rate": 1.6900000000000001e-06, + "loss": 0.0, + "num_tokens": 1458970.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 91.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009101000614464283, + "kl": 0.0018309559673070908, + "learning_rate": 1.6896666666666665e-06, + "loss": 0.0001, + "num_tokens": 1459282.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 91.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018523814156651497, + "kl": 0.0007301649311557412, + "learning_rate": 1.6893333333333335e-06, + "loss": 0.0, + "num_tokens": 1459602.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 91.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11225615441799164, + "kl": 0.016585070174187422, + "learning_rate": 1.6889999999999998e-06, + "loss": 0.0008, + "num_tokens": 1459910.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 91.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05018152669072151, + "kl": 0.009000246413052082, + "learning_rate": 1.6886666666666666e-06, + "loss": 0.0004, + "num_tokens": 1460243.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012371880933642387, + "kl": 0.00013869106987840496, + "learning_rate": 1.6883333333333334e-06, + "loss": 0.0, + "num_tokens": 1460499.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 91.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08162127435207367, + "kl": 0.007962659932672977, + "learning_rate": 1.688e-06, + "loss": 0.0004, + "num_tokens": 1460792.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 91.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005772865377366543, + "kl": 0.00034815073013305664, + "learning_rate": 1.6876666666666668e-06, + "loss": 0.0, + "num_tokens": 1461052.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 91.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03673241659998894, + "kl": 0.002948103239759803, + "learning_rate": 1.6873333333333333e-06, + "loss": 0.0002, + "num_tokens": 1461334.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 91.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008549237623810768, + "kl": 0.014917390421032906, + "learning_rate": 1.6870000000000001e-06, + "loss": 0.0007, + "num_tokens": 1461594.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12307348847389221, + "kl": 0.01311293535400182, + "learning_rate": 1.6866666666666667e-06, + "loss": 0.0007, + "num_tokens": 1461865.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 91.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.816999912261963, + "kl": 0.09994024876505136, + "learning_rate": 1.6863333333333335e-06, + "loss": -0.0093, + "num_tokens": 1462154.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 4942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 91.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020536771044135094, + "kl": 0.0005666979268426076, + "learning_rate": 1.6860000000000002e-06, + "loss": 0.0, + "num_tokens": 1462450.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 91.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06226768344640732, + "kl": 0.017683228012174368, + "learning_rate": 1.6856666666666666e-06, + "loss": 0.0008, + "num_tokens": 1462777.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 91.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05985826253890991, + "kl": 0.1075504794716835, + "learning_rate": 1.6853333333333336e-06, + "loss": 0.0054, + "num_tokens": 1463149.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 91.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07356952875852585, + "kl": 0.016571279615163803, + "learning_rate": 1.685e-06, + "loss": 0.0008, + "num_tokens": 1463499.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 91.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07550165057182312, + "kl": 0.015795729123055935, + "learning_rate": 1.6846666666666667e-06, + "loss": 0.0008, + "num_tokens": 1463783.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 91.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3067377507686615, + "kl": 0.031524146907031536, + "learning_rate": 1.6843333333333333e-06, + "loss": 0.0017, + "num_tokens": 1464089.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07108604907989502, + "kl": 0.00878575723618269, + "learning_rate": 1.684e-06, + "loss": 0.0004, + "num_tokens": 1464349.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006079776678234339, + "kl": 0.0016200095415115356, + "learning_rate": 1.6836666666666667e-06, + "loss": 0.0001, + "num_tokens": 1464565.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 91.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017288338392972946, + "kl": 0.0006929486989974976, + "learning_rate": 1.6833333333333335e-06, + "loss": 0.0, + "num_tokens": 1464777.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028068071231245995, + "kl": 0.002798804547637701, + "learning_rate": 1.6830000000000002e-06, + "loss": 0.0001, + "num_tokens": 1465037.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 91.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.951070547103882, + "kl": 0.07814485020935535, + "learning_rate": 1.6826666666666666e-06, + "loss": 0.1023, + "num_tokens": 1465409.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 91.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625244528055191, + "kl": 0.010406154673546553, + "learning_rate": 1.6823333333333336e-06, + "loss": 0.0005, + "num_tokens": 1465745.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 91.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.840162992477417, + "kl": 0.12812123447656631, + "learning_rate": 1.682e-06, + "loss": -0.0599, + "num_tokens": 1466119.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 4955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 91.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025614103651605546, + "kl": 7.656589150428772e-05, + "learning_rate": 1.6816666666666667e-06, + "loss": 0.0, + "num_tokens": 1466363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 91.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16323304176330566, + "kl": 0.02387662325054407, + "learning_rate": 1.6813333333333333e-06, + "loss": 0.0013, + "num_tokens": 1466663.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 91.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2361888438463211, + "kl": 0.05485841631889343, + "learning_rate": 1.681e-06, + "loss": 0.0028, + "num_tokens": 1466984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 91.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.013523578643799, + "kl": 0.03875024616718292, + "learning_rate": 1.6806666666666667e-06, + "loss": -0.0101, + "num_tokens": 1467290.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 91.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09304482489824295, + "kl": 0.006097201490774751, + "learning_rate": 1.6803333333333334e-06, + "loss": 0.0003, + "num_tokens": 1467554.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 91.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04290613904595375, + "kl": 0.16057635843753815, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.008, + "num_tokens": 1467864.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 91.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11511863023042679, + "kl": 0.04005991294980049, + "learning_rate": 1.6796666666666666e-06, + "loss": 0.002, + "num_tokens": 1468133.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 91.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008777844719588757, + "kl": 0.00017987936735153198, + "learning_rate": 1.6793333333333336e-06, + "loss": 0.0, + "num_tokens": 1468403.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 91.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10896319150924683, + "kl": 0.025346003472805023, + "learning_rate": 1.679e-06, + "loss": 0.0013, + "num_tokens": 1468757.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 91.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.057124137878418, + "kl": 0.005520134000107646, + "learning_rate": 1.6786666666666667e-06, + "loss": 0.0006, + "num_tokens": 1469047.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 4965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 91.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03335168585181236, + "kl": 0.0008818720234557986, + "learning_rate": 1.6783333333333333e-06, + "loss": 0.0, + "num_tokens": 1469282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 91.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7279026508331299, + "kl": 0.052693258970975876, + "learning_rate": 1.678e-06, + "loss": 0.0538, + "num_tokens": 1469701.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 4967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 92.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003178240731358528, + "kl": 0.2684226781129837, + "learning_rate": 1.6776666666666666e-06, + "loss": 0.0134, + "num_tokens": 1470005.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 92.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.807964324951172, + "kl": 0.027837354689836502, + "learning_rate": 1.6773333333333334e-06, + "loss": 0.0666, + "num_tokens": 1470296.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 4969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.87394905090332, + "kl": 0.00461736461147666, + "learning_rate": 1.6770000000000002e-06, + "loss": 0.1489, + "num_tokens": 1470514.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 92.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07200003415346146, + "kl": 0.008428729604929686, + "learning_rate": 1.6766666666666668e-06, + "loss": 0.0004, + "num_tokens": 1470839.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 92.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07832275331020355, + "kl": 0.008064561057835817, + "learning_rate": 1.6763333333333336e-06, + "loss": 0.0004, + "num_tokens": 1471128.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 92.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.649913787841797, + "kl": 0.0860157017596066, + "learning_rate": 1.676e-06, + "loss": -0.2037, + "num_tokens": 1471406.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 92.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14575444161891937, + "kl": 0.01035630349360872, + "learning_rate": 1.6756666666666667e-06, + "loss": 0.0005, + "num_tokens": 1471662.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 92.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.772905111312866, + "kl": 0.02095145918428898, + "learning_rate": 1.6753333333333333e-06, + "loss": -0.0029, + "num_tokens": 1471954.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 4975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.013888888992369175, + "clip_ratio/low_min": 0.013888888992369175, + "clip_ratio/region_mean": 0.013888888992369175, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 92.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8809361457824707, + "kl": 0.586449109017849, + "learning_rate": 1.675e-06, + "loss": -0.0046, + "num_tokens": 1472255.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 4976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 92.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.035835266113281, + "kl": 0.06366511806845665, + "learning_rate": 1.6746666666666666e-06, + "loss": -0.0861, + "num_tokens": 1472465.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 4977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 92.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.383103370666504, + "kl": 0.052995434030890465, + "learning_rate": 1.6743333333333334e-06, + "loss": 0.0419, + "num_tokens": 1472813.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 92.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08518697321414948, + "kl": 0.002585767302662134, + "learning_rate": 1.6740000000000002e-06, + "loss": 0.0001, + "num_tokens": 1473109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 92.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10820410400629044, + "kl": 0.021352801471948624, + "learning_rate": 1.6736666666666668e-06, + "loss": 0.0011, + "num_tokens": 1473421.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 92.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19683966040611267, + "kl": 0.05924472399055958, + "learning_rate": 1.6733333333333335e-06, + "loss": 0.0029, + "num_tokens": 1473837.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 92.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025471318513154984, + "kl": 0.0008457452058792114, + "learning_rate": 1.673e-06, + "loss": 0.0, + "num_tokens": 1474097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 92.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015883689746260643, + "kl": 0.0034166210098192096, + "learning_rate": 1.6726666666666667e-06, + "loss": 0.0002, + "num_tokens": 1474365.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007451504934579134, + "kl": 0.00198943167924881, + "learning_rate": 1.6723333333333333e-06, + "loss": 0.0001, + "num_tokens": 1474581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 92.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.134199142456055, + "kl": 0.08974160626530647, + "learning_rate": 1.672e-06, + "loss": 0.1165, + "num_tokens": 1474926.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 4985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 92.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038822464644908905, + "kl": 0.0992378257215023, + "learning_rate": 1.6716666666666666e-06, + "loss": 0.005, + "num_tokens": 1475298.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 92.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05400337278842926, + "kl": 0.02195595996454358, + "learning_rate": 1.6713333333333334e-06, + "loss": 0.0011, + "num_tokens": 1475569.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 92.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037973422557115555, + "kl": 0.004420760742505081, + "learning_rate": 1.6710000000000002e-06, + "loss": 0.0002, + "num_tokens": 1475829.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.3632804843364283e-05, + "kl": 2.8908252716064453e-06, + "learning_rate": 1.6706666666666668e-06, + "loss": 0.0, + "num_tokens": 1476049.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 4989 + }, + { + "clip_ratio/high_max": 0.009615384973585606, + "clip_ratio/high_mean": 0.009615384973585606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009615384973585606, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 92.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.129278182983398, + "kl": 0.04073519539088011, + "learning_rate": 1.6703333333333335e-06, + "loss": 0.1077, + "num_tokens": 1476357.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 4990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006086123757995665, + "kl": 0.0038093402981758118, + "learning_rate": 1.6699999999999999e-06, + "loss": 0.0002, + "num_tokens": 1476593.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 92.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06056710705161095, + "kl": 0.03574089426547289, + "learning_rate": 1.6696666666666669e-06, + "loss": 0.0018, + "num_tokens": 1476896.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03683551028370857, + "kl": 0.0018821939593181014, + "learning_rate": 1.6693333333333332e-06, + "loss": 0.0001, + "num_tokens": 1477115.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 4993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 92.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03492877632379532, + "kl": 0.0024379646638408303, + "learning_rate": 1.669e-06, + "loss": 0.0001, + "num_tokens": 1477391.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 92.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06660422682762146, + "kl": 0.012176299467682838, + "learning_rate": 1.6686666666666666e-06, + "loss": 0.0006, + "num_tokens": 1477729.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 4995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 92.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02502434141933918, + "kl": 0.0010655286605469882, + "learning_rate": 1.6683333333333334e-06, + "loss": 0.0001, + "num_tokens": 1478050.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 92.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06613127142190933, + "kl": 0.004655712749809027, + "learning_rate": 1.6680000000000002e-06, + "loss": 0.0002, + "num_tokens": 1478304.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 4997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 92.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01316966861486435, + "kl": 0.005927043997871806, + "learning_rate": 1.6676666666666667e-06, + "loss": 0.0003, + "num_tokens": 1478574.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 4998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 92.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2286741733551025, + "kl": 0.1367866089567542, + "learning_rate": 1.6673333333333335e-06, + "loss": 0.0673, + "num_tokens": 1478939.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 4999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 92.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3375140130519867, + "kl": 0.048935662023723125, + "learning_rate": 1.6669999999999999e-06, + "loss": 0.0035, + "num_tokens": 1479222.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 92.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032447674311697483, + "kl": 0.26840534806251526, + "learning_rate": 1.6666666666666669e-06, + "loss": 0.0134, + "num_tokens": 1479526.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 92.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13131465017795563, + "kl": 0.01511222030967474, + "learning_rate": 1.6663333333333332e-06, + "loss": 0.0008, + "num_tokens": 1479871.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 92.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03960214927792549, + "kl": 0.0015117888106033206, + "learning_rate": 1.666e-06, + "loss": 0.0001, + "num_tokens": 1480135.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 92.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2215038537979126, + "kl": 0.022577311377972364, + "learning_rate": 1.6656666666666666e-06, + "loss": 0.0011, + "num_tokens": 1480422.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 92.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05715415999293327, + "kl": 0.00910698575899005, + "learning_rate": 1.6653333333333334e-06, + "loss": 0.0005, + "num_tokens": 1480704.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 92.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09245792776346207, + "kl": 0.04517386294901371, + "learning_rate": 1.6650000000000002e-06, + "loss": 0.0022, + "num_tokens": 1481056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 92.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12053433060646057, + "kl": 0.027421538718044758, + "learning_rate": 1.6646666666666667e-06, + "loss": 0.0014, + "num_tokens": 1481410.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 92.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0312466062605381, + "kl": 0.0008065802976489067, + "learning_rate": 1.6643333333333335e-06, + "loss": 0.0, + "num_tokens": 1481726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 92.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028037171810865402, + "kl": 0.0013543638633564115, + "learning_rate": 1.6639999999999999e-06, + "loss": 0.0001, + "num_tokens": 1481998.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 92.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1913347244262695, + "kl": 0.06364433281123638, + "learning_rate": 1.6636666666666669e-06, + "loss": 0.0175, + "num_tokens": 1482374.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 92.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013355693779885769, + "kl": 0.0016948528354987502, + "learning_rate": 1.6633333333333332e-06, + "loss": 0.0001, + "num_tokens": 1482651.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 92.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0109371617436409, + "kl": 0.014523950405418873, + "learning_rate": 1.663e-06, + "loss": 0.0007, + "num_tokens": 1482911.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5012 + }, + { + "clip_ratio/high_max": 0.011363636702299118, + "clip_ratio/high_mean": 0.011363636702299118, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 92.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.241844177246094, + "kl": 0.02016565576195717, + "learning_rate": 1.6626666666666666e-06, + "loss": -0.0023, + "num_tokens": 1483239.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 92.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04952339082956314, + "kl": 0.006025760900229216, + "learning_rate": 1.6623333333333334e-06, + "loss": 0.0003, + "num_tokens": 1483541.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 92.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04403427988290787, + "kl": 0.1632367968559265, + "learning_rate": 1.6620000000000001e-06, + "loss": 0.0082, + "num_tokens": 1483850.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 92.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003072462510317564, + "kl": 0.00010261541319778189, + "learning_rate": 1.6616666666666667e-06, + "loss": 0.0, + "num_tokens": 1484104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 92.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03218698874115944, + "kl": 0.004485756158828735, + "learning_rate": 1.6613333333333335e-06, + "loss": 0.0002, + "num_tokens": 1484316.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 92.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07350842654705048, + "kl": 0.015073230490088463, + "learning_rate": 1.661e-06, + "loss": 0.0008, + "num_tokens": 1484608.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 92.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041076578199863434, + "kl": 0.003065375378355384, + "learning_rate": 1.6606666666666668e-06, + "loss": 0.0002, + "num_tokens": 1484900.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 92.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007892746478319168, + "kl": 0.010176368523389101, + "learning_rate": 1.6603333333333332e-06, + "loss": 0.0005, + "num_tokens": 1485172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 92.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15762390196323395, + "kl": 0.005376547574996948, + "learning_rate": 1.66e-06, + "loss": 0.0003, + "num_tokens": 1485404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03976396843791008, + "kl": 0.011428375728428364, + "learning_rate": 1.6596666666666666e-06, + "loss": 0.0006, + "num_tokens": 1485688.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 93.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.26559591293335, + "kl": 0.06295810453593731, + "learning_rate": 1.6593333333333333e-06, + "loss": 0.0151, + "num_tokens": 1486048.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27831727266311646, + "kl": 0.031008249148726463, + "learning_rate": 1.6590000000000001e-06, + "loss": 0.0016, + "num_tokens": 1486334.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.607852935791016, + "kl": 0.050431785377440974, + "learning_rate": 1.6586666666666667e-06, + "loss": 0.0515, + "num_tokens": 1486621.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03103502467274666, + "kl": 0.0013500666827894747, + "learning_rate": 1.6583333333333335e-06, + "loss": 0.0001, + "num_tokens": 1486940.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 93.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017780333757400513, + "kl": 0.0492660328745842, + "learning_rate": 1.658e-06, + "loss": 0.0025, + "num_tokens": 1487272.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 93.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.433762788772583, + "kl": 0.043206318136071786, + "learning_rate": 1.6576666666666668e-06, + "loss": 0.0029, + "num_tokens": 1487513.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05056290328502655, + "kl": 0.004398422548547387, + "learning_rate": 1.6573333333333332e-06, + "loss": 0.0002, + "num_tokens": 1487801.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 93.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013421811163425446, + "kl": 4.9064554332289845e-05, + "learning_rate": 1.657e-06, + "loss": 0.0, + "num_tokens": 1488073.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 93.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031043440103530884, + "kl": 0.00475098192691803, + "learning_rate": 1.6566666666666665e-06, + "loss": 0.0002, + "num_tokens": 1488285.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1078578233718872, + "kl": 0.47988858609460294, + "learning_rate": 1.6563333333333333e-06, + "loss": 0.0241, + "num_tokens": 1488567.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 93.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008925163419917226, + "kl": 0.00011397525668144226, + "learning_rate": 1.6560000000000001e-06, + "loss": 0.0, + "num_tokens": 1488811.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08188676834106445, + "kl": 0.009675647597759962, + "learning_rate": 1.6556666666666667e-06, + "loss": 0.0005, + "num_tokens": 1489115.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 93.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011076646856963634, + "kl": 0.014485355466604233, + "learning_rate": 1.6553333333333335e-06, + "loss": 0.0007, + "num_tokens": 1489375.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 93.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05944862961769104, + "kl": 0.00756123336032033, + "learning_rate": 1.655e-06, + "loss": 0.0004, + "num_tokens": 1489714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024482984095811844, + "kl": 0.001643153140321374, + "learning_rate": 1.6546666666666668e-06, + "loss": 0.0001, + "num_tokens": 1489994.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 93.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.84996509552002, + "kl": 0.019488862904836424, + "learning_rate": 1.6543333333333332e-06, + "loss": 0.1299, + "num_tokens": 1490215.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 93.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11664830148220062, + "kl": 0.04044315405189991, + "learning_rate": 1.6540000000000002e-06, + "loss": 0.002, + "num_tokens": 1490517.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 93.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052643537521362305, + "kl": 0.06707258895039558, + "learning_rate": 1.6536666666666665e-06, + "loss": 0.0034, + "num_tokens": 1490895.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 93.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03135155141353607, + "kl": 0.04444164037704468, + "learning_rate": 1.6533333333333333e-06, + "loss": 0.0022, + "num_tokens": 1491299.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 93.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2690921127796173, + "kl": 0.03421100229024887, + "learning_rate": 1.653e-06, + "loss": 0.0017, + "num_tokens": 1491505.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 93.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8561997413635254, + "kl": 0.07261555641889572, + "learning_rate": 1.6526666666666667e-06, + "loss": 0.0041, + "num_tokens": 1491843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026965666562318802, + "kl": 0.0013179140514694154, + "learning_rate": 1.6523333333333335e-06, + "loss": 0.0001, + "num_tokens": 1492111.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 93.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09048201888799667, + "kl": 0.0154511583968997, + "learning_rate": 1.652e-06, + "loss": 0.0009, + "num_tokens": 1492390.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 93.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07614016532897949, + "kl": 0.0038604214787483215, + "learning_rate": 1.6516666666666668e-06, + "loss": 0.0002, + "num_tokens": 1492602.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 93.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04271329939365387, + "kl": 0.0010043196380138397, + "learning_rate": 1.6513333333333332e-06, + "loss": 0.0001, + "num_tokens": 1492862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 93.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01605292037129402, + "kl": 0.00045746201067231596, + "learning_rate": 1.6510000000000002e-06, + "loss": 0.0, + "num_tokens": 1493178.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 93.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003291687462478876, + "kl": 0.26838211715221405, + "learning_rate": 1.6506666666666665e-06, + "loss": 0.0134, + "num_tokens": 1493482.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 93.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028504453599452972, + "kl": 0.0004374742457002867, + "learning_rate": 1.6503333333333333e-06, + "loss": 0.0, + "num_tokens": 1493738.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 93.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.340577483177185, + "kl": 0.10927124321460724, + "learning_rate": 1.65e-06, + "loss": 0.0065, + "num_tokens": 1493956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 93.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01485071238130331, + "kl": 0.001964425668120384, + "learning_rate": 1.6496666666666667e-06, + "loss": 0.0001, + "num_tokens": 1494268.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 93.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12154436111450195, + "kl": 0.019749329425394535, + "learning_rate": 1.6493333333333334e-06, + "loss": 0.001, + "num_tokens": 1494562.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 93.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.277632236480713, + "kl": 0.1807153820991516, + "learning_rate": 1.649e-06, + "loss": -0.0239, + "num_tokens": 1494869.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 93.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049506545066833496, + "kl": 0.014848444610834122, + "learning_rate": 1.6486666666666668e-06, + "loss": 0.0007, + "num_tokens": 1495166.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 93.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047044917941093445, + "kl": 0.01242524420376867, + "learning_rate": 1.6483333333333332e-06, + "loss": 0.0006, + "num_tokens": 1495488.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 93.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03062351793050766, + "kl": 0.004175095586106181, + "learning_rate": 1.6480000000000001e-06, + "loss": 0.0002, + "num_tokens": 1495786.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 93.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3765788078308105, + "kl": 0.5038115493953228, + "learning_rate": 1.6476666666666665e-06, + "loss": 0.0664, + "num_tokens": 1496116.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 93.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.166257858276367, + "kl": 0.01836752239614725, + "learning_rate": 1.6473333333333333e-06, + "loss": 0.2176, + "num_tokens": 1496461.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 93.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00017637595010455698, + "kl": 5.759298801422119e-06, + "learning_rate": 1.647e-06, + "loss": 0.0, + "num_tokens": 1496681.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 93.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05268222838640213, + "kl": 0.019020277075469494, + "learning_rate": 1.6466666666666666e-06, + "loss": 0.001, + "num_tokens": 1497040.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 93.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9249795079231262, + "kl": 0.21187730878591537, + "learning_rate": 1.6463333333333334e-06, + "loss": 0.0127, + "num_tokens": 1497447.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 93.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10928282141685486, + "kl": 0.010948408860713243, + "learning_rate": 1.646e-06, + "loss": 0.0005, + "num_tokens": 1497715.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 93.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020056122913956642, + "kl": 0.003073722356930375, + "learning_rate": 1.6456666666666668e-06, + "loss": 0.0002, + "num_tokens": 1498045.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 93.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03642801195383072, + "kl": 0.007461420493200421, + "learning_rate": 1.6453333333333333e-06, + "loss": 0.0004, + "num_tokens": 1498368.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 93.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011308939196169376, + "kl": 0.0013963497476652265, + "learning_rate": 1.6450000000000001e-06, + "loss": 0.0001, + "num_tokens": 1498638.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 93.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03301926702260971, + "kl": 0.003885791782522574, + "learning_rate": 1.644666666666667e-06, + "loss": 0.0002, + "num_tokens": 1498896.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 93.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05412767454981804, + "kl": 0.02548685297369957, + "learning_rate": 1.6443333333333333e-06, + "loss": 0.0013, + "num_tokens": 1499169.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 93.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027327850461006165, + "kl": 0.0021450609201565385, + "learning_rate": 1.6440000000000003e-06, + "loss": 0.0001, + "num_tokens": 1499429.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 93.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13876545429229736, + "kl": 0.04095316492021084, + "learning_rate": 1.6436666666666666e-06, + "loss": 0.002, + "num_tokens": 1499748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007521340623497963, + "kl": 0.01020409632474184, + "learning_rate": 1.6433333333333334e-06, + "loss": 0.0005, + "num_tokens": 1500020.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 93.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000577855680603534, + "kl": 0.0038079768419265747, + "learning_rate": 1.643e-06, + "loss": 0.0002, + "num_tokens": 1500256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 93.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3283426761627197, + "kl": 0.053489550948143005, + "learning_rate": 1.6426666666666668e-06, + "loss": -0.034, + "num_tokens": 1500618.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 93.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.780770778656006, + "kl": 0.03146049380302429, + "learning_rate": 1.6423333333333333e-06, + "loss": 0.0773, + "num_tokens": 1500894.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 93.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026120349764823914, + "kl": 0.007943099364638329, + "learning_rate": 1.6420000000000001e-06, + "loss": 0.0004, + "num_tokens": 1501172.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 94.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0223385076969862, + "kl": 0.0049424098688177764, + "learning_rate": 1.641666666666667e-06, + "loss": 0.0002, + "num_tokens": 1501465.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1057843565940857, + "kl": 0.006456733332015574, + "learning_rate": 1.6413333333333333e-06, + "loss": 0.0003, + "num_tokens": 1501735.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 94.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7795674800872803, + "kl": 0.07165789604187012, + "learning_rate": 1.6410000000000003e-06, + "loss": 0.0025, + "num_tokens": 1502017.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 94.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06185257434844971, + "kl": 0.00764935789629817, + "learning_rate": 1.6406666666666666e-06, + "loss": 0.0004, + "num_tokens": 1502348.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 94.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08810275048017502, + "kl": 0.00854942761361599, + "learning_rate": 1.6403333333333334e-06, + "loss": 0.0004, + "num_tokens": 1502616.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 94.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20623786747455597, + "kl": 0.031064768321812153, + "learning_rate": 1.64e-06, + "loss": 0.0019, + "num_tokens": 1502900.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 94.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12593761086463928, + "kl": 0.012586106546223164, + "learning_rate": 1.6396666666666668e-06, + "loss": 0.0006, + "num_tokens": 1503195.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 94.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01429614145308733, + "kl": 0.0005785822868347168, + "learning_rate": 1.6393333333333333e-06, + "loss": 0.0, + "num_tokens": 1503407.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 94.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.658608913421631, + "kl": 0.0704478845000267, + "learning_rate": 1.639e-06, + "loss": 0.0446, + "num_tokens": 1503725.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 94.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04747282713651657, + "kl": 0.017318569123744965, + "learning_rate": 1.6386666666666669e-06, + "loss": 0.0009, + "num_tokens": 1504025.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 94.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3169313669204712, + "kl": 0.22855431586503983, + "learning_rate": 1.6383333333333332e-06, + "loss": 0.0293, + "num_tokens": 1504433.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 5086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 94.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.659715414047241, + "kl": 0.15037638694047928, + "learning_rate": 1.6380000000000002e-06, + "loss": -0.0188, + "num_tokens": 1504773.0, + "reward": 3.0, + "reward_std": 3.674234628677368, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 3.674234628677368, + "step": 5087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 94.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03332719951868057, + "kl": 0.004549508390482515, + "learning_rate": 1.6376666666666666e-06, + "loss": 0.0002, + "num_tokens": 1505075.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 94.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11035460978746414, + "kl": 0.022511586954351515, + "learning_rate": 1.6373333333333334e-06, + "loss": 0.0011, + "num_tokens": 1505341.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 94.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07171207666397095, + "kl": 0.011405151803046465, + "learning_rate": 1.637e-06, + "loss": 0.0006, + "num_tokens": 1505671.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 94.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1093517541885376, + "kl": 0.006516335415653884, + "learning_rate": 1.6366666666666667e-06, + "loss": 0.0003, + "num_tokens": 1505995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 94.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038217782974243164, + "kl": 0.005558681208640337, + "learning_rate": 1.6363333333333333e-06, + "loss": 0.0003, + "num_tokens": 1506285.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 94.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05555345118045807, + "kl": 0.001620567578356713, + "learning_rate": 1.636e-06, + "loss": 0.0001, + "num_tokens": 1506581.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09022354334592819, + "kl": 0.0028458016458898783, + "learning_rate": 1.6356666666666669e-06, + "loss": 0.0001, + "num_tokens": 1506855.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 94.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07398438453674316, + "kl": 0.0136982761323452, + "learning_rate": 1.6353333333333334e-06, + "loss": 0.0007, + "num_tokens": 1507179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 94.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03835136443376541, + "kl": 0.0019263034919276834, + "learning_rate": 1.6350000000000002e-06, + "loss": 0.0001, + "num_tokens": 1507500.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 94.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07044315338134766, + "kl": 0.0024491348303854465, + "learning_rate": 1.6346666666666666e-06, + "loss": 0.0001, + "num_tokens": 1507734.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 94.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.664107084274292, + "kl": 0.009934463538229465, + "learning_rate": 1.6343333333333334e-06, + "loss": 0.0353, + "num_tokens": 1508023.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 94.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012818897375836968, + "kl": 0.00024852753267623484, + "learning_rate": 1.634e-06, + "loss": 0.0, + "num_tokens": 1508243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 94.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07100926339626312, + "kl": 0.01244360813871026, + "learning_rate": 1.6336666666666667e-06, + "loss": 0.0006, + "num_tokens": 1508529.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 64.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 64.75, + "completions/mean_terminated_length": 64.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 94.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1297500133514404, + "kl": 0.0583345852792263, + "learning_rate": 1.6333333333333333e-06, + "loss": 0.2886, + "num_tokens": 1509004.0, + "reward": 6.375, + "reward_std": 3.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 3.25, + "step": 5101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 94.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022757509723305702, + "kl": 0.00015012547373771667, + "learning_rate": 1.633e-06, + "loss": 0.0, + "num_tokens": 1509248.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 94.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05654054507613182, + "kl": 0.011175133055076003, + "learning_rate": 1.6326666666666669e-06, + "loss": 0.0006, + "num_tokens": 1509590.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 94.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03896370530128479, + "kl": 0.00512346881441772, + "learning_rate": 1.6323333333333334e-06, + "loss": 0.0003, + "num_tokens": 1509859.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 94.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008043305017054081, + "kl": 0.07291961647570133, + "learning_rate": 1.6320000000000002e-06, + "loss": 0.0037, + "num_tokens": 1510229.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 94.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012667941860854626, + "kl": 0.002352175652049482, + "learning_rate": 1.6316666666666666e-06, + "loss": 0.0001, + "num_tokens": 1510506.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 94.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038390362169593573, + "kl": 0.26831941306591034, + "learning_rate": 1.6313333333333334e-06, + "loss": 0.0134, + "num_tokens": 1510810.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 94.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.051037311553955, + "kl": 0.17304487526416779, + "learning_rate": 1.631e-06, + "loss": -0.0328, + "num_tokens": 1511184.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 94.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16897113621234894, + "kl": 0.1642039492726326, + "learning_rate": 1.6306666666666667e-06, + "loss": 0.0082, + "num_tokens": 1511498.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006400995887815952, + "kl": 0.00047546329733449966, + "learning_rate": 1.6303333333333333e-06, + "loss": 0.0, + "num_tokens": 1511758.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 94.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053089018911123276, + "kl": 0.0031430646777153015, + "learning_rate": 1.63e-06, + "loss": 0.0001, + "num_tokens": 1512012.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 94.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0484151653945446, + "kl": 0.027952161617577076, + "learning_rate": 1.6296666666666668e-06, + "loss": 0.0014, + "num_tokens": 1512286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 94.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.124202251434326, + "kl": 0.05679469741880894, + "learning_rate": 1.6293333333333334e-06, + "loss": 0.0574, + "num_tokens": 1512609.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 94.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042268916964530945, + "kl": 0.02734996471554041, + "learning_rate": 1.6290000000000002e-06, + "loss": 0.0015, + "num_tokens": 1512973.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 94.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10678981244564056, + "kl": 0.051696695387363434, + "learning_rate": 1.6286666666666666e-06, + "loss": 0.0023, + "num_tokens": 1513327.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 94.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02062048763036728, + "kl": 0.000997929397271946, + "learning_rate": 1.6283333333333336e-06, + "loss": 0.0, + "num_tokens": 1513636.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5116 + }, + { + "clip_ratio/high_max": 0.007936508394777775, + "clip_ratio/high_mean": 0.007936508394777775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007936508394777775, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 94.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1902663707733154, + "kl": 0.03688059840351343, + "learning_rate": 1.628e-06, + "loss": 0.1702, + "num_tokens": 1514015.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 94.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01191367581486702, + "kl": 0.014215979259461164, + "learning_rate": 1.6276666666666667e-06, + "loss": 0.0007, + "num_tokens": 1514275.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005249172449112, + "kl": 0.009833820164203644, + "learning_rate": 1.6273333333333333e-06, + "loss": 0.0005, + "num_tokens": 1514545.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 94.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4023919403553009, + "kl": 0.029481276869773865, + "learning_rate": 1.627e-06, + "loss": 0.0015, + "num_tokens": 1514753.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06025628000497818, + "kl": 0.011574001866392791, + "learning_rate": 1.6266666666666668e-06, + "loss": 0.0005, + "num_tokens": 1515026.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 94.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032000113278627396, + "kl": 0.000591340649407357, + "learning_rate": 1.6263333333333334e-06, + "loss": 0.0, + "num_tokens": 1515282.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 94.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019223337585572153, + "kl": 6.541609764099121e-06, + "learning_rate": 1.6260000000000002e-06, + "loss": 0.0, + "num_tokens": 1515502.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 94.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007296347175724804, + "kl": 0.003776274621486664, + "learning_rate": 1.6256666666666665e-06, + "loss": 0.0002, + "num_tokens": 1515738.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 94.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10547690838575363, + "kl": 0.00818365067243576, + "learning_rate": 1.6253333333333335e-06, + "loss": 0.0004, + "num_tokens": 1515954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 94.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031101077795028687, + "kl": 0.0028826892375946045, + "learning_rate": 1.625e-06, + "loss": 0.0001, + "num_tokens": 1516166.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 94.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014484002254903316, + "kl": 0.0019085241947323084, + "learning_rate": 1.6246666666666667e-06, + "loss": 0.0001, + "num_tokens": 1516480.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 94.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21377907693386078, + "kl": 0.04807031853124499, + "learning_rate": 1.6243333333333333e-06, + "loss": 0.0024, + "num_tokens": 1516770.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 94.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3453080952167511, + "kl": 0.02882286161184311, + "learning_rate": 1.624e-06, + "loss": 0.002, + "num_tokens": 1517039.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 95.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12480010837316513, + "kl": 0.017304659821093082, + "learning_rate": 1.6236666666666668e-06, + "loss": 0.0009, + "num_tokens": 1517332.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010862167924642563, + "kl": 0.00013622641017718706, + "learning_rate": 1.6233333333333334e-06, + "loss": 0.0, + "num_tokens": 1517588.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13647964596748352, + "kl": 0.013996335212141275, + "learning_rate": 1.6230000000000002e-06, + "loss": 0.0008, + "num_tokens": 1517855.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 95.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04975675791501999, + "kl": 0.011340032564476132, + "learning_rate": 1.6226666666666665e-06, + "loss": 0.0006, + "num_tokens": 1518133.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 95.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015183904208242893, + "kl": 0.0004988627406419255, + "learning_rate": 1.6223333333333335e-06, + "loss": 0.0, + "num_tokens": 1518442.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36430010199546814, + "kl": 0.02252253331243992, + "learning_rate": 1.6219999999999999e-06, + "loss": 0.0014, + "num_tokens": 1518711.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 95.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011199631728231907, + "kl": 0.014363312162458897, + "learning_rate": 1.6216666666666667e-06, + "loss": 0.0007, + "num_tokens": 1518971.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 95.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05741075798869133, + "kl": 0.008718229364603758, + "learning_rate": 1.6213333333333332e-06, + "loss": 0.0004, + "num_tokens": 1519244.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 95.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003971273545175791, + "kl": 0.2682976573705673, + "learning_rate": 1.621e-06, + "loss": 0.0134, + "num_tokens": 1519548.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07563067972660065, + "kl": 0.018135390244424343, + "learning_rate": 1.6206666666666668e-06, + "loss": 0.0009, + "num_tokens": 1519840.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 95.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039467714726924896, + "kl": 0.001513257622718811, + "learning_rate": 1.6203333333333334e-06, + "loss": 0.0001, + "num_tokens": 1520084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 95.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03072650358080864, + "kl": 0.002341926097869873, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.0001, + "num_tokens": 1520296.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 95.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15167327225208282, + "kl": 0.014023125171661377, + "learning_rate": 1.6196666666666667e-06, + "loss": 0.0007, + "num_tokens": 1520508.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 95.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0114447558298707, + "kl": 0.0013392396504059434, + "learning_rate": 1.6193333333333335e-06, + "loss": 0.0001, + "num_tokens": 1520806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 95.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03400212526321411, + "kl": 0.0018738221260719001, + "learning_rate": 1.6189999999999999e-06, + "loss": 0.0001, + "num_tokens": 1521096.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 95.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08252135664224625, + "kl": 0.00547359639313072, + "learning_rate": 1.6186666666666667e-06, + "loss": 0.0003, + "num_tokens": 1521362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09761747717857361, + "kl": 0.014867460820823908, + "learning_rate": 1.6183333333333332e-06, + "loss": 0.0008, + "num_tokens": 1521686.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 95.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.120587348937988, + "kl": 0.16405999660491943, + "learning_rate": 1.618e-06, + "loss": -0.2575, + "num_tokens": 1522003.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 5147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12070372700691223, + "kl": 0.17373964935541153, + "learning_rate": 1.6176666666666668e-06, + "loss": 0.0088, + "num_tokens": 1522318.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 95.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024428071919828653, + "kl": 0.0003391765058040619, + "learning_rate": 1.6173333333333334e-06, + "loss": 0.0, + "num_tokens": 1522578.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 95.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07362038642168045, + "kl": 0.0026927399449050426, + "learning_rate": 1.6170000000000001e-06, + "loss": 0.0001, + "num_tokens": 1522812.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 95.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.200491428375244, + "kl": 0.04430763237178326, + "learning_rate": 1.6166666666666667e-06, + "loss": 0.1794, + "num_tokens": 1523098.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 95.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03638202324509621, + "kl": 0.006569494726136327, + "learning_rate": 1.6163333333333335e-06, + "loss": 0.0003, + "num_tokens": 1523432.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 95.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.450490713119507, + "kl": 0.07818220183253288, + "learning_rate": 1.6159999999999999e-06, + "loss": -0.1063, + "num_tokens": 1523795.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 95.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0378687381744385, + "kl": 0.06665090471506119, + "learning_rate": 1.6156666666666666e-06, + "loss": 0.0043, + "num_tokens": 1524178.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 95.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03414076566696167, + "kl": 0.000407390296459198, + "learning_rate": 1.6153333333333332e-06, + "loss": 0.0, + "num_tokens": 1524390.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 95.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12677936255931854, + "kl": 0.02297324687242508, + "learning_rate": 1.615e-06, + "loss": 0.0012, + "num_tokens": 1524724.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05070136860013008, + "kl": 0.004233626881614327, + "learning_rate": 1.6146666666666668e-06, + "loss": 0.0002, + "num_tokens": 1525028.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 95.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008868768927641213, + "kl": 0.0037405937910079956, + "learning_rate": 1.6143333333333333e-06, + "loss": 0.0002, + "num_tokens": 1525264.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 95.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005571421701461077, + "kl": 0.001623174932319671, + "learning_rate": 1.6140000000000001e-06, + "loss": 0.0001, + "num_tokens": 1525544.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 95.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09620469808578491, + "kl": 0.013978281989693642, + "learning_rate": 1.6136666666666667e-06, + "loss": 0.0007, + "num_tokens": 1525856.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103268563747406, + "kl": 0.011964778881520033, + "learning_rate": 1.6133333333333335e-06, + "loss": 0.0006, + "num_tokens": 1526164.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 95.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.62724769115448, + "kl": 0.08902581129223108, + "learning_rate": 1.6129999999999998e-06, + "loss": 0.0044, + "num_tokens": 1526470.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 54.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 95.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4955475330352783, + "kl": 0.11508592963218689, + "learning_rate": 1.6126666666666666e-06, + "loss": 0.1889, + "num_tokens": 1526905.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 5163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 95.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25937536358833313, + "kl": 0.04300510138273239, + "learning_rate": 1.6123333333333332e-06, + "loss": 0.0022, + "num_tokens": 1527204.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 95.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5185606479644775, + "kl": 0.08510687947273254, + "learning_rate": 1.612e-06, + "loss": 0.0502, + "num_tokens": 1527554.0, + "reward": 4.25, + "reward_std": 4.27200174331665, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 4.27200174331665, + "step": 5165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03310784697532654, + "kl": 0.0032910079462453723, + "learning_rate": 1.6116666666666668e-06, + "loss": 0.0002, + "num_tokens": 1527814.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22392171621322632, + "kl": 0.023513258900493383, + "learning_rate": 1.6113333333333333e-06, + "loss": 0.0012, + "num_tokens": 1528090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 95.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0042220475152134895, + "kl": 0.0002702832280192524, + "learning_rate": 1.6110000000000001e-06, + "loss": 0.0, + "num_tokens": 1528310.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 95.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055837348103523254, + "kl": 0.0033182734914589673, + "learning_rate": 1.6106666666666667e-06, + "loss": 0.0001, + "num_tokens": 1528584.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4184170067310333, + "kl": 0.06885102717205882, + "learning_rate": 1.6103333333333335e-06, + "loss": 0.0032, + "num_tokens": 1528911.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 95.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16252626478672028, + "kl": 0.05184198170900345, + "learning_rate": 1.6099999999999998e-06, + "loss": 0.0027, + "num_tokens": 1529216.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 95.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03228530287742615, + "kl": 0.005028032814152539, + "learning_rate": 1.6096666666666668e-06, + "loss": 0.0003, + "num_tokens": 1529498.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 95.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00021318643121048808, + "kl": 7.450580596923828e-06, + "learning_rate": 1.6093333333333332e-06, + "loss": 0.0, + "num_tokens": 1529718.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 95.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12534946203231812, + "kl": 0.020774316042661667, + "learning_rate": 1.609e-06, + "loss": 0.001, + "num_tokens": 1529993.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 95.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024069350212812424, + "kl": 0.0032211471116170287, + "learning_rate": 1.6086666666666668e-06, + "loss": 0.0002, + "num_tokens": 1530321.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 95.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03737330436706543, + "kl": 0.10044170543551445, + "learning_rate": 1.6083333333333333e-06, + "loss": 0.005, + "num_tokens": 1530693.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 95.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0907745435833931, + "kl": 0.04705662652850151, + "learning_rate": 1.608e-06, + "loss": 0.0023, + "num_tokens": 1531055.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 95.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016896385699510574, + "kl": 0.0008772032451815903, + "learning_rate": 1.6076666666666667e-06, + "loss": 0.0, + "num_tokens": 1531372.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 95.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31291264295578003, + "kl": 0.02850928157567978, + "learning_rate": 1.6073333333333335e-06, + "loss": 0.0015, + "num_tokens": 1531646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 95.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1370113044977188, + "kl": 0.017792532220482826, + "learning_rate": 1.6069999999999998e-06, + "loss": 0.0009, + "num_tokens": 1531934.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 95.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06672362238168716, + "kl": 0.0038387924432754517, + "learning_rate": 1.6066666666666668e-06, + "loss": 0.0002, + "num_tokens": 1532150.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 95.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05014263838529587, + "kl": 0.01246769493445754, + "learning_rate": 1.6063333333333332e-06, + "loss": 0.0006, + "num_tokens": 1532444.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 95.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01600436121225357, + "kl": 0.0005118479311931878, + "learning_rate": 1.606e-06, + "loss": 0.0, + "num_tokens": 1532714.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 96.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03981270268559456, + "kl": 0.056245286017656326, + "learning_rate": 1.6056666666666667e-06, + "loss": 0.0028, + "num_tokens": 1533119.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 96.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.256351470947266, + "kl": 0.7909174561500549, + "learning_rate": 1.6053333333333333e-06, + "loss": 0.0387, + "num_tokens": 1533422.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 96.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.734254360198975, + "kl": 0.1715548001229763, + "learning_rate": 1.605e-06, + "loss": 0.0486, + "num_tokens": 1533720.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 96.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4434525966644287, + "kl": 0.1689465567469597, + "learning_rate": 1.6046666666666667e-06, + "loss": -0.0827, + "num_tokens": 1534088.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 5187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 96.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03285718336701393, + "kl": 0.0008119975100271404, + "learning_rate": 1.6043333333333334e-06, + "loss": 0.0, + "num_tokens": 1534352.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 96.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1380830556154251, + "kl": 0.025421341881155968, + "learning_rate": 1.604e-06, + "loss": 0.0013, + "num_tokens": 1534696.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 96.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5371273159980774, + "kl": 0.13559747487306595, + "learning_rate": 1.6036666666666668e-06, + "loss": 0.0069, + "num_tokens": 1535036.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 96.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.086024284362793, + "kl": 0.03677371144294739, + "learning_rate": 1.6033333333333336e-06, + "loss": 0.1202, + "num_tokens": 1535308.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 96.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008534154039807618, + "kl": 0.0037450119853019714, + "learning_rate": 1.603e-06, + "loss": 0.0002, + "num_tokens": 1535544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03920384496450424, + "kl": 0.007517733611166477, + "learning_rate": 1.602666666666667e-06, + "loss": 0.0003, + "num_tokens": 1535836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 96.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013419232331216335, + "kl": 0.0004963747051078826, + "learning_rate": 1.6023333333333333e-06, + "loss": 0.0, + "num_tokens": 1536072.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 96.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020311851403675973, + "kl": 7.27921724319458e-06, + "learning_rate": 1.602e-06, + "loss": 0.0, + "num_tokens": 1536292.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 96.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9683189392089844, + "kl": 0.028609320521354675, + "learning_rate": 1.6016666666666666e-06, + "loss": 0.0539, + "num_tokens": 1536586.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 96.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.197132587432861, + "kl": 0.05988541431725025, + "learning_rate": 1.6013333333333334e-06, + "loss": 0.236, + "num_tokens": 1536951.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 96.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23637382686138153, + "kl": 0.04909998178482056, + "learning_rate": 1.601e-06, + "loss": 0.0026, + "num_tokens": 1537290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.124506950378418, + "kl": 0.01471470925025642, + "learning_rate": 1.6006666666666668e-06, + "loss": 0.0452, + "num_tokens": 1537576.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 96.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13501040637493134, + "kl": 0.014340505935251713, + "learning_rate": 1.6003333333333336e-06, + "loss": 0.0007, + "num_tokens": 1537848.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 96.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03232679143548012, + "kl": 0.0013084628735668957, + "learning_rate": 1.6e-06, + "loss": 0.0001, + "num_tokens": 1538144.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 96.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008099677972495556, + "kl": 0.0001684397502685897, + "learning_rate": 1.599666666666667e-06, + "loss": 0.0, + "num_tokens": 1538400.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 96.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07905364036560059, + "kl": 0.016734112985432148, + "learning_rate": 1.5993333333333333e-06, + "loss": 0.0009, + "num_tokens": 1538734.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 96.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10110728442668915, + "kl": 0.014798803720623255, + "learning_rate": 1.599e-06, + "loss": 0.0007, + "num_tokens": 1539025.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 96.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11827294528484344, + "kl": 0.0057364702224731445, + "learning_rate": 1.5986666666666666e-06, + "loss": 0.0003, + "num_tokens": 1539231.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 96.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02143985405564308, + "kl": 0.004726713988929987, + "learning_rate": 1.5983333333333334e-06, + "loss": 0.0002, + "num_tokens": 1539489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 96.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05490705743432045, + "kl": 0.016320059075951576, + "learning_rate": 1.598e-06, + "loss": 0.0008, + "num_tokens": 1539788.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 96.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.003941059112549, + "kl": 0.01771488878875971, + "learning_rate": 1.5976666666666668e-06, + "loss": 0.0905, + "num_tokens": 1540082.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07804497331380844, + "kl": 0.0273137129843235, + "learning_rate": 1.5973333333333336e-06, + "loss": 0.0014, + "num_tokens": 1540354.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 96.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043495986610651016, + "kl": 0.007365534518612549, + "learning_rate": 1.597e-06, + "loss": 0.0004, + "num_tokens": 1540624.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09378044307231903, + "kl": 0.007655891356989741, + "learning_rate": 1.596666666666667e-06, + "loss": 0.0004, + "num_tokens": 1540926.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 96.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027063585817813873, + "kl": 0.0014878429647069424, + "learning_rate": 1.5963333333333333e-06, + "loss": 0.0001, + "num_tokens": 1541238.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 96.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1158662810921669, + "kl": 0.061968524008989334, + "learning_rate": 1.596e-06, + "loss": 0.0031, + "num_tokens": 1541589.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 96.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020407572388648987, + "kl": 0.09662141278386116, + "learning_rate": 1.5956666666666666e-06, + "loss": 0.0048, + "num_tokens": 1541961.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 96.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030843282118439674, + "kl": 0.0003384128212928772, + "learning_rate": 1.5953333333333334e-06, + "loss": 0.0, + "num_tokens": 1542173.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 96.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029449859634041786, + "kl": 0.0008158758282661438, + "learning_rate": 1.595e-06, + "loss": 0.0, + "num_tokens": 1542433.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 96.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04036524146795273, + "kl": 0.018667737022042274, + "learning_rate": 1.5946666666666668e-06, + "loss": 0.0009, + "num_tokens": 1542804.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.319491386413574, + "kl": 0.020450257696211338, + "learning_rate": 1.5943333333333335e-06, + "loss": 0.1355, + "num_tokens": 1543097.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 96.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023998115211725235, + "kl": 0.0037993593141436577, + "learning_rate": 1.5940000000000001e-06, + "loss": 0.0002, + "num_tokens": 1543429.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 96.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029288535937666893, + "kl": 0.0012310373422224075, + "learning_rate": 1.593666666666667e-06, + "loss": 0.0001, + "num_tokens": 1543753.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 96.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0808610916137695, + "kl": 0.0095128309330903, + "learning_rate": 1.5933333333333333e-06, + "loss": 0.0633, + "num_tokens": 1544037.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 96.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05988513305783272, + "kl": 0.014316507615149021, + "learning_rate": 1.593e-06, + "loss": 0.0008, + "num_tokens": 1544311.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 96.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010522710159420967, + "kl": 0.0015555593417957425, + "learning_rate": 1.5926666666666666e-06, + "loss": 0.0001, + "num_tokens": 1544579.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 96.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1397678554058075, + "kl": 0.016374513506889343, + "learning_rate": 1.5923333333333334e-06, + "loss": 0.0008, + "num_tokens": 1544901.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 96.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006504387594759464, + "kl": 0.0015593841671943665, + "learning_rate": 1.592e-06, + "loss": 0.0001, + "num_tokens": 1545117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 96.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05867267772555351, + "kl": 0.0030096396803855896, + "learning_rate": 1.5916666666666667e-06, + "loss": 0.0002, + "num_tokens": 1545366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 96.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1795063614845276, + "kl": 0.00440611457452178, + "learning_rate": 1.5913333333333335e-06, + "loss": 0.0003, + "num_tokens": 1545586.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 96.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08656342327594757, + "kl": 0.009550884831696749, + "learning_rate": 1.591e-06, + "loss": 0.0005, + "num_tokens": 1545913.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 96.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012767196167260408, + "kl": 0.00024462938745273277, + "learning_rate": 1.5906666666666669e-06, + "loss": 0.0, + "num_tokens": 1546133.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 96.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.990260601043701, + "kl": 0.10891515691764653, + "learning_rate": 1.5903333333333332e-06, + "loss": 0.1837, + "num_tokens": 1546491.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 5230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027175987139344215, + "kl": 0.009174252860248089, + "learning_rate": 1.59e-06, + "loss": 0.0005, + "num_tokens": 1546765.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 96.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0720847100019455, + "kl": 0.0019926356617361307, + "learning_rate": 1.5896666666666666e-06, + "loss": 0.0001, + "num_tokens": 1547035.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 96.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2600727081298828, + "kl": 0.04558689740952104, + "learning_rate": 1.5893333333333334e-06, + "loss": 0.0025, + "num_tokens": 1547297.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 96.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0636097639799118, + "kl": 0.016690427903085947, + "learning_rate": 1.589e-06, + "loss": 0.0008, + "num_tokens": 1547559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 96.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10395558178424835, + "kl": 0.03940185043029487, + "learning_rate": 1.5886666666666667e-06, + "loss": 0.002, + "num_tokens": 1547849.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 96.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11616316437721252, + "kl": 0.05716337263584137, + "learning_rate": 1.5883333333333335e-06, + "loss": 0.0028, + "num_tokens": 1548253.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 96.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019060350954532623, + "kl": 0.0007412591949105263, + "learning_rate": 1.588e-06, + "loss": 0.0, + "num_tokens": 1548565.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015772633254528046, + "kl": 0.16136373579502106, + "learning_rate": 1.5876666666666669e-06, + "loss": 0.0081, + "num_tokens": 1548873.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 97.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05440906435251236, + "kl": 0.008499347837641835, + "learning_rate": 1.5873333333333332e-06, + "loss": 0.0004, + "num_tokens": 1549161.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039005979895591736, + "kl": 0.0015152791747823358, + "learning_rate": 1.5870000000000002e-06, + "loss": 0.0001, + "num_tokens": 1549431.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.855672836303711, + "kl": 0.0875339973717928, + "learning_rate": 1.5866666666666666e-06, + "loss": -0.0825, + "num_tokens": 1549757.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025368299335241318, + "kl": 0.0020077545195817947, + "learning_rate": 1.5863333333333334e-06, + "loss": 0.0001, + "num_tokens": 1550010.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 97.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024747377261519432, + "kl": 0.005036524264141917, + "learning_rate": 1.586e-06, + "loss": 0.0003, + "num_tokens": 1550346.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32676878571510315, + "kl": 0.08721771091222763, + "learning_rate": 1.5856666666666667e-06, + "loss": 0.0047, + "num_tokens": 1550640.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 97.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2946219444274902, + "kl": 0.032050661742687225, + "learning_rate": 1.5853333333333335e-06, + "loss": -0.0327, + "num_tokens": 1551024.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 97.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1397843360900879, + "kl": 0.03741133585572243, + "learning_rate": 1.585e-06, + "loss": 0.0019, + "num_tokens": 1551308.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 97.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7527925968170166, + "kl": 0.08169154822826385, + "learning_rate": 1.5846666666666669e-06, + "loss": -0.0154, + "num_tokens": 1551670.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 5247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2162325382232666, + "kl": 0.029286948963999748, + "learning_rate": 1.5843333333333332e-06, + "loss": 0.0234, + "num_tokens": 1551964.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 97.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15857195854187, + "kl": 0.5796295739710331, + "learning_rate": 1.5840000000000002e-06, + "loss": 0.019, + "num_tokens": 1552259.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 5249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08338232338428497, + "kl": 0.009691339917480946, + "learning_rate": 1.5836666666666666e-06, + "loss": 0.0005, + "num_tokens": 1552525.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014464963169302791, + "kl": 4.537403583526611e-06, + "learning_rate": 1.5833333333333333e-06, + "loss": 0.0, + "num_tokens": 1552745.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 97.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011426261626183987, + "kl": 0.01420259429141879, + "learning_rate": 1.583e-06, + "loss": 0.0007, + "num_tokens": 1553005.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 97.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.811831474304199, + "kl": 0.12035173550248146, + "learning_rate": 1.5826666666666667e-06, + "loss": 0.0109, + "num_tokens": 1553334.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537535697221756, + "kl": 0.009263205574825406, + "learning_rate": 1.5823333333333335e-06, + "loss": 0.0005, + "num_tokens": 1553662.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012192741967737675, + "kl": 0.008494176901876926, + "learning_rate": 1.582e-06, + "loss": 0.0004, + "num_tokens": 1553934.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 97.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01998489536345005, + "kl": 0.0010051537537947297, + "learning_rate": 1.5816666666666668e-06, + "loss": 0.0001, + "num_tokens": 1554230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 97.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05653964355587959, + "kl": 0.012128156144171953, + "learning_rate": 1.5813333333333332e-06, + "loss": 0.0007, + "num_tokens": 1554567.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030683143064379692, + "kl": 0.0006210058927536011, + "learning_rate": 1.5810000000000002e-06, + "loss": 0.0, + "num_tokens": 1554779.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 97.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8385437726974487, + "kl": 0.21021639555692673, + "learning_rate": 1.5806666666666666e-06, + "loss": 0.0116, + "num_tokens": 1555119.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 97.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064893439412117, + "kl": 0.0026374012231826782, + "learning_rate": 1.5803333333333333e-06, + "loss": 0.0001, + "num_tokens": 1555331.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 97.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3022162914276123, + "kl": 0.3044265806674957, + "learning_rate": 1.58e-06, + "loss": -0.0696, + "num_tokens": 1555697.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 5261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00520613556727767, + "kl": 0.0013129889848642051, + "learning_rate": 1.5796666666666667e-06, + "loss": 0.0001, + "num_tokens": 1555957.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 97.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4643997848033905, + "kl": 0.041804001142736524, + "learning_rate": 1.5793333333333335e-06, + "loss": 0.0024, + "num_tokens": 1556279.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03763243183493614, + "kl": 0.015672972425818443, + "learning_rate": 1.579e-06, + "loss": 0.0007, + "num_tokens": 1556631.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 97.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019132385030388832, + "kl": 0.265506386756897, + "learning_rate": 1.5786666666666668e-06, + "loss": 0.0133, + "num_tokens": 1556935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 97.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09526564180850983, + "kl": 0.016265312675386667, + "learning_rate": 1.5783333333333334e-06, + "loss": 0.0007, + "num_tokens": 1557259.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 97.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07023649662733078, + "kl": 0.006840124959126115, + "learning_rate": 1.5780000000000002e-06, + "loss": 0.0004, + "num_tokens": 1557568.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 97.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038880303502082825, + "kl": 0.15783193707466125, + "learning_rate": 1.5776666666666665e-06, + "loss": 0.0079, + "num_tokens": 1557878.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 97.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05422493815422058, + "kl": 0.0028571193106472492, + "learning_rate": 1.5773333333333333e-06, + "loss": 0.0001, + "num_tokens": 1558121.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 97.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06280789524316788, + "kl": 0.009660904761403799, + "learning_rate": 1.5769999999999999e-06, + "loss": 0.0005, + "num_tokens": 1558409.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03092641569674015, + "kl": 0.0037699388340115547, + "learning_rate": 1.5766666666666667e-06, + "loss": 0.0002, + "num_tokens": 1558693.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 97.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0648389682173729, + "kl": 0.10496380552649498, + "learning_rate": 1.5763333333333335e-06, + "loss": 0.0052, + "num_tokens": 1559065.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007291814312338829, + "kl": 0.00016025701916078106, + "learning_rate": 1.576e-06, + "loss": 0.0, + "num_tokens": 1559335.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008909560274332762, + "kl": 0.003739573061466217, + "learning_rate": 1.5756666666666668e-06, + "loss": 0.0002, + "num_tokens": 1559571.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.143528938293457, + "kl": 0.029185396801040042, + "learning_rate": 1.5753333333333334e-06, + "loss": 0.1239, + "num_tokens": 1559838.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.853620171546936, + "kl": 0.10425485437735915, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0036, + "num_tokens": 1560158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19300009310245514, + "kl": 0.027175567112863064, + "learning_rate": 1.5746666666666665e-06, + "loss": 0.0014, + "num_tokens": 1560441.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 97.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06380121409893036, + "kl": 0.0016522258520126343, + "learning_rate": 1.5743333333333333e-06, + "loss": 0.0001, + "num_tokens": 1560701.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 97.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.099404335021973, + "kl": 0.03659984492696822, + "learning_rate": 1.5739999999999999e-06, + "loss": 0.0654, + "num_tokens": 1560976.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 97.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019011495634913445, + "kl": 0.0007260367274284363, + "learning_rate": 1.5736666666666667e-06, + "loss": 0.0, + "num_tokens": 1561288.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 97.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010913548059761524, + "kl": 0.00045352215238381177, + "learning_rate": 1.5733333333333334e-06, + "loss": 0.0, + "num_tokens": 1561602.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 97.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052574340254068375, + "kl": 0.015412142500281334, + "learning_rate": 1.573e-06, + "loss": 0.0008, + "num_tokens": 1561904.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0987841784954071, + "kl": 0.003970506833866239, + "learning_rate": 1.5726666666666668e-06, + "loss": 0.0002, + "num_tokens": 1562117.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 97.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21417739987373352, + "kl": 0.018260984565131366, + "learning_rate": 1.5723333333333334e-06, + "loss": 0.0011, + "num_tokens": 1562355.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 97.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02639508806169033, + "kl": 0.0065969196148216724, + "learning_rate": 1.5720000000000002e-06, + "loss": 0.0003, + "num_tokens": 1562647.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 97.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2457984685897827, + "kl": 0.4335959553718567, + "learning_rate": 1.5716666666666665e-06, + "loss": -0.0131, + "num_tokens": 1563047.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 5286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 97.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001239181961864233, + "kl": 0.0012553312699310482, + "learning_rate": 1.5713333333333333e-06, + "loss": 0.0001, + "num_tokens": 1563327.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 97.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.417866230010986, + "kl": 0.11619619559496641, + "learning_rate": 1.5709999999999999e-06, + "loss": 0.1439, + "num_tokens": 1563672.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 97.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01846601627767086, + "kl": 0.002122808597050607, + "learning_rate": 1.5706666666666666e-06, + "loss": 0.0001, + "num_tokens": 1563940.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 97.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5261380672454834, + "kl": 0.006361888954415917, + "learning_rate": 1.5703333333333334e-06, + "loss": 0.4192, + "num_tokens": 1564365.0, + "reward": 7.425000190734863, + "reward_std": 0.15000009536743164, + "rewards/reward_combined/mean": 7.425000190734863, + "rewards/reward_combined/std": 0.15000009536743164, + "step": 5290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 97.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052766088396310806, + "kl": 0.0031872778199613094, + "learning_rate": 1.57e-06, + "loss": 0.0001, + "num_tokens": 1564582.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014063029550015926, + "kl": 0.00013606548600364476, + "learning_rate": 1.5696666666666668e-06, + "loss": 0.0, + "num_tokens": 1564838.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 98.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024823691695928574, + "kl": 0.00239275477360934, + "learning_rate": 1.5693333333333334e-06, + "loss": 0.0001, + "num_tokens": 1565136.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 98.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03821438178420067, + "kl": 0.03898667357861996, + "learning_rate": 1.5690000000000001e-06, + "loss": 0.0019, + "num_tokens": 1565540.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 98.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04019486531615257, + "kl": 0.005967809120193124, + "learning_rate": 1.5686666666666665e-06, + "loss": 0.0003, + "num_tokens": 1565867.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 98.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.191434383392334, + "kl": 0.09390311315655708, + "learning_rate": 1.5683333333333335e-06, + "loss": -0.0424, + "num_tokens": 1566195.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 5296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06939195841550827, + "kl": 0.0035546133294701576, + "learning_rate": 1.5679999999999999e-06, + "loss": 0.0002, + "num_tokens": 1566459.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 98.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037322599440813065, + "kl": 0.09946007654070854, + "learning_rate": 1.5676666666666666e-06, + "loss": 0.005, + "num_tokens": 1566831.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016945242881774902, + "kl": 0.005102618131786585, + "learning_rate": 1.5673333333333334e-06, + "loss": 0.0003, + "num_tokens": 1567099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 98.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02787545695900917, + "kl": 0.0010244142613373697, + "learning_rate": 1.567e-06, + "loss": 0.0001, + "num_tokens": 1567423.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1003614068031311, + "kl": 0.009341378579847515, + "learning_rate": 1.5666666666666668e-06, + "loss": 0.0005, + "num_tokens": 1567725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 98.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02863270603120327, + "kl": 0.004132440779358149, + "learning_rate": 1.5663333333333333e-06, + "loss": 0.0002, + "num_tokens": 1568050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 98.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11757971346378326, + "kl": 0.016049266327172518, + "learning_rate": 1.5660000000000001e-06, + "loss": 0.0008, + "num_tokens": 1568341.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 98.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07015055418014526, + "kl": 0.0019031152478419244, + "learning_rate": 1.5656666666666665e-06, + "loss": 0.0001, + "num_tokens": 1568574.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013869311660528183, + "kl": 0.0009351014741696417, + "learning_rate": 1.5653333333333335e-06, + "loss": 0.0, + "num_tokens": 1568834.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 98.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18689417839050293, + "kl": 0.026396608911454678, + "learning_rate": 1.5649999999999998e-06, + "loss": 0.0012, + "num_tokens": 1569172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 98.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03671710565686226, + "kl": 0.004010175005532801, + "learning_rate": 1.5646666666666666e-06, + "loss": 0.0002, + "num_tokens": 1569461.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 98.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615633949637413, + "kl": 0.004450254142284393, + "learning_rate": 1.5643333333333334e-06, + "loss": 0.0002, + "num_tokens": 1569677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1403944343328476, + "kl": 0.008808583690552041, + "learning_rate": 1.564e-06, + "loss": 0.0004, + "num_tokens": 1569951.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 98.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014764757361263037, + "kl": 4.641711711883545e-06, + "learning_rate": 1.5636666666666668e-06, + "loss": 0.0, + "num_tokens": 1570171.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 98.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1486303061246872, + "kl": 0.02301202341914177, + "learning_rate": 1.5633333333333333e-06, + "loss": 0.0012, + "num_tokens": 1570482.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 98.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05841120705008507, + "kl": 0.02735324203968048, + "learning_rate": 1.5630000000000001e-06, + "loss": 0.0014, + "num_tokens": 1570874.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 98.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23246394097805023, + "kl": 0.04803896322846413, + "learning_rate": 1.5626666666666665e-06, + "loss": 0.0023, + "num_tokens": 1571188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 98.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008211187086999416, + "kl": 0.0007003595528658479, + "learning_rate": 1.5623333333333335e-06, + "loss": 0.0, + "num_tokens": 1571408.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 98.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.275888919830322, + "kl": 0.02710882108658552, + "learning_rate": 1.5620000000000002e-06, + "loss": 0.2353, + "num_tokens": 1571703.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 98.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.344841003417969, + "kl": 0.6561076119542122, + "learning_rate": 1.5616666666666666e-06, + "loss": -0.2573, + "num_tokens": 1571999.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 5316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 71.5, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.028782606124878, + "kl": 0.008606459014117718, + "learning_rate": 1.5613333333333336e-06, + "loss": 0.4631, + "num_tokens": 1572505.0, + "reward": 7.300000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.300000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 5317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 98.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8740912079811096, + "kl": 0.11713118478655815, + "learning_rate": 1.561e-06, + "loss": 0.0064, + "num_tokens": 1572838.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 98.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006007605232298374, + "kl": 0.0003809332847595215, + "learning_rate": 1.5606666666666667e-06, + "loss": 0.0, + "num_tokens": 1573098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012878204695880413, + "kl": 0.0017801049398258328, + "learning_rate": 1.5603333333333333e-06, + "loss": 0.0001, + "num_tokens": 1573375.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 98.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023811189457774162, + "kl": 0.2646760046482086, + "learning_rate": 1.56e-06, + "loss": 0.0132, + "num_tokens": 1573679.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 98.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006480825133621693, + "kl": 0.00012418627738952637, + "learning_rate": 1.5596666666666667e-06, + "loss": 0.0, + "num_tokens": 1573891.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 98.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5577068328857422, + "kl": 0.049407415091991425, + "learning_rate": 1.5593333333333335e-06, + "loss": 0.0597, + "num_tokens": 1574239.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009676500223577023, + "kl": 0.0005857936921529472, + "learning_rate": 1.5590000000000002e-06, + "loss": 0.0, + "num_tokens": 1574501.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 98.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012871612794697285, + "kl": 0.0005100475536892191, + "learning_rate": 1.5586666666666666e-06, + "loss": 0.0, + "num_tokens": 1574810.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6796942353248596, + "kl": 0.0760381855070591, + "learning_rate": 1.5583333333333336e-06, + "loss": 0.0036, + "num_tokens": 1575090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058911386877298355, + "kl": 0.0014866202618577518, + "learning_rate": 1.558e-06, + "loss": 0.0001, + "num_tokens": 1575346.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 98.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008437736541964114, + "kl": 0.003752976655960083, + "learning_rate": 1.5576666666666667e-06, + "loss": 0.0002, + "num_tokens": 1575582.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 98.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002823119517415762, + "kl": 0.00024881362332962453, + "learning_rate": 1.5573333333333333e-06, + "loss": 0.0, + "num_tokens": 1575802.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 98.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0280938148498535, + "kl": 0.22838620003312826, + "learning_rate": 1.557e-06, + "loss": 0.0386, + "num_tokens": 1576111.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 98.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04869832843542099, + "kl": 0.020665702410042286, + "learning_rate": 1.5566666666666667e-06, + "loss": 0.0009, + "num_tokens": 1576464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 98.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08788575232028961, + "kl": 0.0032032057642936707, + "learning_rate": 1.5563333333333334e-06, + "loss": 0.0002, + "num_tokens": 1576674.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 98.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016967543633654714, + "kl": 0.0001244927480001934, + "learning_rate": 1.5560000000000002e-06, + "loss": 0.0, + "num_tokens": 1576944.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 98.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.604962348937988, + "kl": 0.016524864826351404, + "learning_rate": 1.5556666666666666e-06, + "loss": 0.0472, + "num_tokens": 1577206.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 98.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3547906279563904, + "kl": 0.05260276701301336, + "learning_rate": 1.5553333333333336e-06, + "loss": 0.0031, + "num_tokens": 1577487.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 98.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6202030181884766, + "kl": 0.04861114360392094, + "learning_rate": 1.555e-06, + "loss": 0.062, + "num_tokens": 1577849.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 98.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3343932330608368, + "kl": 0.022482444532215595, + "learning_rate": 1.5546666666666667e-06, + "loss": 0.001, + "num_tokens": 1578113.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 98.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05401170626282692, + "kl": 0.0022344777826219797, + "learning_rate": 1.5543333333333333e-06, + "loss": 0.0001, + "num_tokens": 1578362.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 98.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022278184071183205, + "kl": 0.003950534504838288, + "learning_rate": 1.554e-06, + "loss": 0.0002, + "num_tokens": 1578654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 98.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07880939543247223, + "kl": 0.057072628289461136, + "learning_rate": 1.5536666666666666e-06, + "loss": 0.0029, + "num_tokens": 1578945.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 98.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.069204092025757, + "kl": 0.09081784635782242, + "learning_rate": 1.5533333333333334e-06, + "loss": -0.0611, + "num_tokens": 1579317.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 98.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01980489492416382, + "kl": 0.002585027366876602, + "learning_rate": 1.5530000000000002e-06, + "loss": 0.0001, + "num_tokens": 1579629.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 98.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031025422737002373, + "kl": 0.003615888301283121, + "learning_rate": 1.5526666666666668e-06, + "loss": 0.0002, + "num_tokens": 1579913.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 98.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02114158309996128, + "kl": 0.004775496083311737, + "learning_rate": 1.5523333333333336e-06, + "loss": 0.0002, + "num_tokens": 1580247.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 98.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07181570678949356, + "kl": 0.01126834750175476, + "learning_rate": 1.552e-06, + "loss": 0.0006, + "num_tokens": 1580521.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06870504468679428, + "kl": 0.010243732016533613, + "learning_rate": 1.5516666666666667e-06, + "loss": 0.0005, + "num_tokens": 1580848.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 99.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24986718595027924, + "kl": 0.011883744155056775, + "learning_rate": 1.5513333333333333e-06, + "loss": 0.0006, + "num_tokens": 1581119.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053214896470308304, + "kl": 0.011505658272653818, + "learning_rate": 1.551e-06, + "loss": 0.0006, + "num_tokens": 1581413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 99.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.103551149368286, + "kl": 0.1987457387149334, + "learning_rate": 1.5506666666666666e-06, + "loss": 0.1555, + "num_tokens": 1581762.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 5349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 99.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17072442173957825, + "kl": 0.021500190254300833, + "learning_rate": 1.5503333333333334e-06, + "loss": 0.0011, + "num_tokens": 1582096.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 99.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09728353470563889, + "kl": 0.027855553664267063, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.0014, + "num_tokens": 1582367.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 99.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.540256977081299, + "kl": 0.20179709047079086, + "learning_rate": 1.5496666666666668e-06, + "loss": -0.0031, + "num_tokens": 1582670.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05950898304581642, + "kl": 0.007303935009986162, + "learning_rate": 1.5493333333333335e-06, + "loss": 0.0004, + "num_tokens": 1583003.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 99.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04495961219072342, + "kl": 0.0029592177888844162, + "learning_rate": 1.549e-06, + "loss": 0.0001, + "num_tokens": 1583271.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 99.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015279130078852177, + "kl": 0.0010715940152294934, + "learning_rate": 1.5486666666666667e-06, + "loss": 0.0001, + "num_tokens": 1583539.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06639955937862396, + "kl": 0.017613645642995834, + "learning_rate": 1.5483333333333333e-06, + "loss": 0.0011, + "num_tokens": 1583898.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 99.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02236146852374077, + "kl": 0.001061448361724615, + "learning_rate": 1.548e-06, + "loss": 0.0001, + "num_tokens": 1584167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 99.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04704895615577698, + "kl": 0.0005344539822544903, + "learning_rate": 1.5476666666666666e-06, + "loss": 0.0, + "num_tokens": 1584381.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 99.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05783044919371605, + "kl": 0.001563534140586853, + "learning_rate": 1.5473333333333334e-06, + "loss": 0.0001, + "num_tokens": 1584591.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 99.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09060274809598923, + "kl": 0.009187803603708744, + "learning_rate": 1.5470000000000002e-06, + "loss": 0.0005, + "num_tokens": 1584923.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 99.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18047532439231873, + "kl": 0.03346627578139305, + "learning_rate": 1.5466666666666668e-06, + "loss": 0.0017, + "num_tokens": 1585221.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 99.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15399408340454102, + "kl": 0.03782643564045429, + "learning_rate": 1.5463333333333335e-06, + "loss": 0.0019, + "num_tokens": 1585544.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 99.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3413825035095215, + "kl": 0.05090123228728771, + "learning_rate": 1.5459999999999999e-06, + "loss": 0.0996, + "num_tokens": 1585894.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 99.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025552954524755478, + "kl": 0.002415069960989058, + "learning_rate": 1.5456666666666669e-06, + "loss": 0.0001, + "num_tokens": 1586154.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 99.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04145500436425209, + "kl": 0.005083933472633362, + "learning_rate": 1.5453333333333332e-06, + "loss": 0.0003, + "num_tokens": 1586370.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 99.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010239742696285248, + "kl": 0.00029083655681461096, + "learning_rate": 1.545e-06, + "loss": 0.0, + "num_tokens": 1586686.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 99.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05068647861480713, + "kl": 0.004774346947669983, + "learning_rate": 1.5446666666666666e-06, + "loss": 0.0002, + "num_tokens": 1586990.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 99.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018730225274339318, + "kl": 0.0003124594804830849, + "learning_rate": 1.5443333333333334e-06, + "loss": 0.0, + "num_tokens": 1587210.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 99.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011681350588332862, + "kl": 4.000961780548096e-06, + "learning_rate": 1.5440000000000002e-06, + "loss": 0.0, + "num_tokens": 1587430.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 99.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01794993132352829, + "kl": 0.0023926477879285812, + "learning_rate": 1.5436666666666667e-06, + "loss": 0.0001, + "num_tokens": 1587742.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 99.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02146325074136257, + "kl": 0.0062999005895107985, + "learning_rate": 1.5433333333333335e-06, + "loss": 0.0003, + "num_tokens": 1588010.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 99.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08506253361701965, + "kl": 0.01793564297258854, + "learning_rate": 1.5429999999999999e-06, + "loss": 0.001, + "num_tokens": 1588292.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 99.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055309124290943146, + "kl": 0.002217214263509959, + "learning_rate": 1.5426666666666669e-06, + "loss": 0.0001, + "num_tokens": 1588611.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 99.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05325845628976822, + "kl": 0.002905784174799919, + "learning_rate": 1.5423333333333332e-06, + "loss": 0.0001, + "num_tokens": 1588860.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 99.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029673736542463303, + "kl": 0.001128291798522696, + "learning_rate": 1.542e-06, + "loss": 0.0001, + "num_tokens": 1589093.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 99.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022981755435466766, + "kl": 0.09596388041973114, + "learning_rate": 1.5416666666666666e-06, + "loss": 0.0048, + "num_tokens": 1589465.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.841578960418701, + "kl": 0.24588903784751892, + "learning_rate": 1.5413333333333334e-06, + "loss": 0.094, + "num_tokens": 1589781.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 99.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19184766709804535, + "kl": 0.015550390351563692, + "learning_rate": 1.5410000000000002e-06, + "loss": 0.0009, + "num_tokens": 1590083.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 99.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.066701889038086, + "kl": 0.007523627951741219, + "learning_rate": 1.5406666666666667e-06, + "loss": 0.033, + "num_tokens": 1590420.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5379 + }, + { + "clip_ratio/high_max": 0.00657894741743803, + "clip_ratio/high_mean": 0.00657894741743803, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00657894741743803, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 99.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.120488405227661, + "kl": 0.07349224388599396, + "learning_rate": 1.5403333333333335e-06, + "loss": 0.0053, + "num_tokens": 1590800.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 99.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007782155880704522, + "kl": 0.003766007721424103, + "learning_rate": 1.5399999999999999e-06, + "loss": 0.0002, + "num_tokens": 1591036.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 99.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.380682945251465, + "kl": 0.12924740463495255, + "learning_rate": 1.5396666666666669e-06, + "loss": 0.1347, + "num_tokens": 1591392.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 5382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 99.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03189317137002945, + "kl": 0.001747717848047614, + "learning_rate": 1.5393333333333332e-06, + "loss": 0.0001, + "num_tokens": 1591695.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 99.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045159101486206055, + "kl": 0.007392449617327657, + "learning_rate": 1.539e-06, + "loss": 0.0004, + "num_tokens": 1591965.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 99.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013723954558372498, + "kl": 0.013721433002501726, + "learning_rate": 1.5386666666666666e-06, + "loss": 0.0007, + "num_tokens": 1592225.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 99.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15103651583194733, + "kl": 0.03804316185414791, + "learning_rate": 1.5383333333333334e-06, + "loss": 0.0019, + "num_tokens": 1592536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 99.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16732698678970337, + "kl": 0.017702241544611752, + "learning_rate": 1.5380000000000001e-06, + "loss": 0.0009, + "num_tokens": 1592826.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 94.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 94.25, + "completions/mean_terminated_length": 40.333335876464844, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 99.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7212657928466797, + "kl": 0.030908092856407166, + "learning_rate": 1.5376666666666667e-06, + "loss": 0.3665, + "num_tokens": 1593427.0, + "reward": 5.425000190734863, + "reward_std": 4.149999618530273, + "rewards/reward_combined/mean": 5.425000190734863, + "rewards/reward_combined/std": 4.150000095367432, + "step": 5388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 99.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09785301238298416, + "kl": 0.013820950407534838, + "learning_rate": 1.5373333333333335e-06, + "loss": 0.0007, + "num_tokens": 1593719.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 99.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.9454638957977295, + "kl": 0.44942344445735216, + "learning_rate": 1.537e-06, + "loss": 0.0232, + "num_tokens": 1594008.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 99.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0752691999077797, + "kl": 0.07165481522679329, + "learning_rate": 1.5366666666666668e-06, + "loss": 0.0036, + "num_tokens": 1594302.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 99.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.373311042785645, + "kl": 0.015349007211625576, + "learning_rate": 1.5363333333333332e-06, + "loss": 0.1194, + "num_tokens": 1594631.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 5392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 99.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02674560621380806, + "kl": 0.0014183521270751953, + "learning_rate": 1.536e-06, + "loss": 0.0001, + "num_tokens": 1594843.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 99.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023457389324903488, + "kl": 0.0003246545675210655, + "learning_rate": 1.5356666666666666e-06, + "loss": 0.0, + "num_tokens": 1595099.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 99.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5527740120887756, + "kl": 0.0588977187871933, + "learning_rate": 1.5353333333333333e-06, + "loss": 0.0036, + "num_tokens": 1595391.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 72.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 72.0, + "completions/mean_terminated_length": 10.666666984558105, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 99.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.405885100364685, + "kl": 0.015394964255392551, + "learning_rate": 1.5350000000000001e-06, + "loss": 0.4571, + "num_tokens": 1595899.0, + "reward": 6.300000190734863, + "reward_std": 2.4000000953674316, + "rewards/reward_combined/mean": 6.300000190734863, + "rewards/reward_combined/std": 2.3999998569488525, + "step": 5396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 99.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008264186908490956, + "kl": 0.0011939768446609378, + "learning_rate": 1.5346666666666667e-06, + "loss": 0.0001, + "num_tokens": 1596179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 99.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005415427964180708, + "kl": 0.00031583383679389954, + "learning_rate": 1.5343333333333335e-06, + "loss": 0.0, + "num_tokens": 1596439.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 99.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036006297916173935, + "kl": 0.031755766831338406, + "learning_rate": 1.534e-06, + "loss": 0.0015, + "num_tokens": 1596855.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 100.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08657006919384003, + "kl": 0.007303065387532115, + "learning_rate": 1.5336666666666668e-06, + "loss": 0.0003, + "num_tokens": 1597115.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 100.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.492645740509033, + "kl": 0.27295850962400436, + "learning_rate": 1.5333333333333332e-06, + "loss": 0.0208, + "num_tokens": 1597423.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 100.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037336818873882294, + "kl": 0.09904588013887405, + "learning_rate": 1.533e-06, + "loss": 0.005, + "num_tokens": 1597795.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 100.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03178287670016289, + "kl": 0.00783985760062933, + "learning_rate": 1.5326666666666665e-06, + "loss": 0.0004, + "num_tokens": 1598118.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 100.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08238870650529861, + "kl": 0.016373123042285442, + "learning_rate": 1.5323333333333333e-06, + "loss": 0.0008, + "num_tokens": 1598411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 100.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07624446600675583, + "kl": 0.0022464243702415843, + "learning_rate": 1.5320000000000001e-06, + "loss": 0.0001, + "num_tokens": 1598668.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 100.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06564126163721085, + "kl": 0.006683208514004946, + "learning_rate": 1.5316666666666667e-06, + "loss": 0.0003, + "num_tokens": 1598995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 100.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08014131337404251, + "kl": 0.017301190178841352, + "learning_rate": 1.5313333333333335e-06, + "loss": 0.0009, + "num_tokens": 1599301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 100.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19261673092842102, + "kl": 0.0203330940566957, + "learning_rate": 1.531e-06, + "loss": 0.0011, + "num_tokens": 1599591.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 100.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031339049339294434, + "kl": 0.0016976014303509146, + "learning_rate": 1.5306666666666668e-06, + "loss": 0.0001, + "num_tokens": 1599810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 100.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.382539987564087, + "kl": 0.04858838557265699, + "learning_rate": 1.5303333333333332e-06, + "loss": 0.1232, + "num_tokens": 1600154.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.5, + "completions/mean_terminated_length": 38.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 100.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7690751552581787, + "kl": 0.10870247334241867, + "learning_rate": 1.53e-06, + "loss": 0.0103, + "num_tokens": 1600536.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 100.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00813469011336565, + "kl": 0.0014763634535484016, + "learning_rate": 1.5296666666666665e-06, + "loss": 0.0001, + "num_tokens": 1600810.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38398101925849915, + "kl": 0.03935919562354684, + "learning_rate": 1.5293333333333333e-06, + "loss": 0.0022, + "num_tokens": 1601113.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 100.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05523664131760597, + "kl": 0.0021810964099131525, + "learning_rate": 1.529e-06, + "loss": 0.0001, + "num_tokens": 1601379.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 100.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057254012674093246, + "kl": 0.01424642140045762, + "learning_rate": 1.5286666666666667e-06, + "loss": 0.0008, + "num_tokens": 1601653.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 100.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035123929381370544, + "kl": 0.0009035170078277588, + "learning_rate": 1.5283333333333335e-06, + "loss": 0.0, + "num_tokens": 1601861.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 100.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0074735041707754135, + "kl": 0.0020648986101150513, + "learning_rate": 1.528e-06, + "loss": 0.0001, + "num_tokens": 1602077.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010329063981771469, + "kl": 0.008671910502016544, + "learning_rate": 1.5276666666666668e-06, + "loss": 0.0004, + "num_tokens": 1602349.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 100.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005601013079285622, + "kl": 0.26794689893722534, + "learning_rate": 1.5273333333333332e-06, + "loss": 0.0134, + "num_tokens": 1602653.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 100.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03041202202439308, + "kl": 0.0010525789693929255, + "learning_rate": 1.5270000000000002e-06, + "loss": 0.0001, + "num_tokens": 1602970.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03022598661482334, + "kl": 0.003677847096696496, + "learning_rate": 1.5266666666666665e-06, + "loss": 0.0002, + "num_tokens": 1603254.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 100.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005837862379848957, + "kl": 0.00020234286785125732, + "learning_rate": 1.5263333333333333e-06, + "loss": 0.0, + "num_tokens": 1603466.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 100.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27575093507766724, + "kl": 0.021130628883838654, + "learning_rate": 1.526e-06, + "loss": 0.001, + "num_tokens": 1603801.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011377870105206966, + "kl": 0.0034642955870367587, + "learning_rate": 1.5256666666666667e-06, + "loss": 0.0001, + "num_tokens": 1604099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 100.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00850125215947628, + "kl": 0.00019849191448884085, + "learning_rate": 1.5253333333333334e-06, + "loss": 0.0, + "num_tokens": 1604367.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 100.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004798910580575466, + "kl": 0.0003161365748383105, + "learning_rate": 1.525e-06, + "loss": 0.0, + "num_tokens": 1604681.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 100.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05713745951652527, + "kl": 0.019365067593753338, + "learning_rate": 1.5246666666666668e-06, + "loss": 0.001, + "num_tokens": 1605039.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 100.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030812595039606094, + "kl": 0.05632269196212292, + "learning_rate": 1.5243333333333332e-06, + "loss": 0.0028, + "num_tokens": 1605332.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 100.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022928824182599783, + "kl": 0.0015261415392160416, + "learning_rate": 1.5240000000000001e-06, + "loss": 0.0001, + "num_tokens": 1605644.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 100.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002207911340519786, + "kl": 6.789564940845594e-05, + "learning_rate": 1.5236666666666665e-06, + "loss": 0.0, + "num_tokens": 1605904.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 100.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07424936443567276, + "kl": 0.009732466656714678, + "learning_rate": 1.5233333333333333e-06, + "loss": 0.0005, + "num_tokens": 1606195.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05747784674167633, + "kl": 0.008547557983547449, + "learning_rate": 1.523e-06, + "loss": 0.0004, + "num_tokens": 1606479.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 100.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031948383897542953, + "kl": 0.025077415630221367, + "learning_rate": 1.5226666666666666e-06, + "loss": 0.001, + "num_tokens": 1606804.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 100.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02988605387508869, + "kl": 0.0009293212206102908, + "learning_rate": 1.5223333333333334e-06, + "loss": 0.0, + "num_tokens": 1607074.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 100.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.043994903564453, + "kl": 0.031462740153074265, + "learning_rate": 1.522e-06, + "loss": 0.1026, + "num_tokens": 1607408.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 100.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007574048358947039, + "kl": 0.0003867149353027344, + "learning_rate": 1.5216666666666668e-06, + "loss": 0.0, + "num_tokens": 1607644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 100.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14587850868701935, + "kl": 0.041430942714214325, + "learning_rate": 1.5213333333333331e-06, + "loss": 0.002, + "num_tokens": 1607988.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 100.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0616946816444397, + "kl": 0.0038246663461904973, + "learning_rate": 1.5210000000000001e-06, + "loss": 0.0002, + "num_tokens": 1608254.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 100.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014803546480834484, + "kl": 0.0016542524099349976, + "learning_rate": 1.520666666666667e-06, + "loss": 0.0001, + "num_tokens": 1608514.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 100.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19577500224113464, + "kl": 0.006077451631426811, + "learning_rate": 1.5203333333333333e-06, + "loss": 0.0003, + "num_tokens": 1608788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 100.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012572052888572216, + "kl": 0.00023206590049085207, + "learning_rate": 1.5200000000000003e-06, + "loss": 0.0, + "num_tokens": 1609030.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 100.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.121943950653076, + "kl": 0.029760083183646202, + "learning_rate": 1.5196666666666666e-06, + "loss": 0.1309, + "num_tokens": 1609351.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 100.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007620318792760372, + "kl": 0.0037682130932807922, + "learning_rate": 1.5193333333333334e-06, + "loss": 0.0002, + "num_tokens": 1609587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 100.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010014322586357594, + "kl": 0.0401719119399786, + "learning_rate": 1.519e-06, + "loss": 0.002, + "num_tokens": 1609992.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 100.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027753768488764763, + "kl": 0.0012380480766296387, + "learning_rate": 1.5186666666666668e-06, + "loss": 0.0001, + "num_tokens": 1610208.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 100.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003488502698019147, + "kl": 0.00019267946481704712, + "learning_rate": 1.5183333333333333e-06, + "loss": 0.0, + "num_tokens": 1610428.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 100.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.728623628616333, + "kl": 0.017682242207229137, + "learning_rate": 1.5180000000000001e-06, + "loss": 0.0281, + "num_tokens": 1610689.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 100.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021002013236284256, + "kl": 0.0006055720150470734, + "learning_rate": 1.517666666666667e-06, + "loss": 0.0, + "num_tokens": 1610949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 100.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09916306287050247, + "kl": 0.01735564274713397, + "learning_rate": 1.5173333333333333e-06, + "loss": 0.0009, + "num_tokens": 1611282.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 100.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029490530490875244, + "kl": 0.0021016259561292827, + "learning_rate": 1.5170000000000003e-06, + "loss": 0.0001, + "num_tokens": 1611578.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.006172839552164078, + "clip_ratio/low_min": 0.006172839552164078, + "clip_ratio/region_mean": 0.006172839552164078, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 100.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9005918502807617, + "kl": 0.06473296135663986, + "learning_rate": 1.5166666666666666e-06, + "loss": 0.2028, + "num_tokens": 1611943.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 5451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 100.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7278189659118652, + "kl": 0.07189453579485416, + "learning_rate": 1.5163333333333334e-06, + "loss": 0.0153, + "num_tokens": 1612274.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 100.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20069493353366852, + "kl": 0.03770335204899311, + "learning_rate": 1.516e-06, + "loss": 0.0019, + "num_tokens": 1612546.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8044025897979736, + "kl": 0.10362424701452255, + "learning_rate": 1.5156666666666668e-06, + "loss": 0.0066, + "num_tokens": 1612826.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 101.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19437113404273987, + "kl": 0.030902760103344917, + "learning_rate": 1.5153333333333333e-06, + "loss": 0.0016, + "num_tokens": 1613151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020378935150802135, + "kl": 3.0475853236566763e-05, + "learning_rate": 1.5150000000000001e-06, + "loss": 0.0, + "num_tokens": 1613411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 101.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0847392082214355, + "kl": 0.07134226709604263, + "learning_rate": 1.5146666666666669e-06, + "loss": 0.1019, + "num_tokens": 1613755.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 101.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051703307777643204, + "kl": 0.014781441539525986, + "learning_rate": 1.5143333333333332e-06, + "loss": 0.0008, + "num_tokens": 1614037.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 101.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2231585830450058, + "kl": 0.019413352943956852, + "learning_rate": 1.5140000000000002e-06, + "loss": 0.001, + "num_tokens": 1614393.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 101.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06328113377094269, + "kl": 0.005902788136154413, + "learning_rate": 1.5136666666666666e-06, + "loss": 0.0003, + "num_tokens": 1614697.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 101.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025488771498203278, + "kl": 0.0013407915830612183, + "learning_rate": 1.5133333333333334e-06, + "loss": 0.0001, + "num_tokens": 1614909.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 101.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028874771669507027, + "kl": 0.00027207285165786743, + "learning_rate": 1.513e-06, + "loss": 0.0, + "num_tokens": 1615121.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 101.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057352304458618164, + "kl": 0.02362719837401528, + "learning_rate": 1.5126666666666667e-06, + "loss": 0.0012, + "num_tokens": 1615408.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 101.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1560592651367188, + "kl": 0.024163642898201942, + "learning_rate": 1.5123333333333333e-06, + "loss": 0.0745, + "num_tokens": 1615782.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 101.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021252160891890526, + "kl": 0.006311272969469428, + "learning_rate": 1.512e-06, + "loss": 0.0003, + "num_tokens": 1616050.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 101.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025522353127598763, + "kl": 0.0011507653980515897, + "learning_rate": 1.5116666666666669e-06, + "loss": 0.0001, + "num_tokens": 1616362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 101.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016603050753474236, + "kl": 0.0029182470170781016, + "learning_rate": 1.5113333333333334e-06, + "loss": 0.0001, + "num_tokens": 1616690.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 101.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07513971626758575, + "kl": 0.006240957882255316, + "learning_rate": 1.5110000000000002e-06, + "loss": 0.0003, + "num_tokens": 1616964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 101.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013259727507829666, + "kl": 0.0006940088205737993, + "learning_rate": 1.5106666666666666e-06, + "loss": 0.0, + "num_tokens": 1617199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14239011704921722, + "kl": 0.04465408995747566, + "learning_rate": 1.5103333333333334e-06, + "loss": 0.0022, + "num_tokens": 1617532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 101.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035039009526371956, + "kl": 0.04595787823200226, + "learning_rate": 1.51e-06, + "loss": 0.0023, + "num_tokens": 1617936.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 101.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07039260864257812, + "kl": 0.015040764585137367, + "learning_rate": 1.5096666666666667e-06, + "loss": 0.0008, + "num_tokens": 1618258.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 101.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7471505745779723e-05, + "kl": 3.255903720855713e-06, + "learning_rate": 1.5093333333333333e-06, + "loss": 0.0, + "num_tokens": 1618478.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 101.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00926927849650383, + "kl": 0.0001908615231513977, + "learning_rate": 1.509e-06, + "loss": 0.0, + "num_tokens": 1618686.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 101.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3001217246055603, + "kl": 0.041287238942459226, + "learning_rate": 1.5086666666666669e-06, + "loss": 0.0023, + "num_tokens": 1618975.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 101.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18405325710773468, + "kl": 0.024043050594627857, + "learning_rate": 1.5083333333333334e-06, + "loss": 0.0015, + "num_tokens": 1619263.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 101.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034434910863637924, + "kl": 0.09782170876860619, + "learning_rate": 1.5080000000000002e-06, + "loss": 0.0049, + "num_tokens": 1619635.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050050657242536545, + "kl": 0.0026265646229148842, + "learning_rate": 1.5076666666666666e-06, + "loss": 0.0001, + "num_tokens": 1619909.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 101.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.890697002410889, + "kl": 0.012026506941765547, + "learning_rate": 1.5073333333333334e-06, + "loss": -0.0367, + "num_tokens": 1620203.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 101.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.481655597686768, + "kl": 0.03292984934523702, + "learning_rate": 1.507e-06, + "loss": 0.0584, + "num_tokens": 1620512.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 5480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009642007760703564, + "kl": 0.00012345909635769203, + "learning_rate": 1.5066666666666667e-06, + "loss": 0.0, + "num_tokens": 1620768.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 101.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0526387020945549, + "kl": 0.0031842728203628212, + "learning_rate": 1.5063333333333333e-06, + "loss": 0.0002, + "num_tokens": 1621066.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 101.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008404916152358055, + "kl": 0.0022246912121772766, + "learning_rate": 1.506e-06, + "loss": 0.0001, + "num_tokens": 1621282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 101.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026012161746621132, + "kl": 0.16249799728393555, + "learning_rate": 1.5056666666666668e-06, + "loss": 0.0081, + "num_tokens": 1621591.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025419604033231735, + "kl": 0.001505140564404428, + "learning_rate": 1.5053333333333334e-06, + "loss": 0.0001, + "num_tokens": 1621862.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 101.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008624795591458678, + "kl": 0.0037489011883735657, + "learning_rate": 1.5050000000000002e-06, + "loss": 0.0002, + "num_tokens": 1622098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 101.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.526792049407959, + "kl": 0.44115081103518605, + "learning_rate": 1.5046666666666666e-06, + "loss": 0.0232, + "num_tokens": 1622359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 101.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009270765585824847, + "kl": 0.0011186195188201964, + "learning_rate": 1.5043333333333333e-06, + "loss": 0.0001, + "num_tokens": 1622639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 101.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5725700855255127, + "kl": 0.11916254088282585, + "learning_rate": 1.504e-06, + "loss": -0.0777, + "num_tokens": 1623005.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 101.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.934995412826538, + "kl": 0.03453459311276674, + "learning_rate": 1.5036666666666667e-06, + "loss": 0.0188, + "num_tokens": 1623297.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 101.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07513511925935745, + "kl": 0.0034808366326615214, + "learning_rate": 1.5033333333333333e-06, + "loss": 0.0002, + "num_tokens": 1623563.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 101.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11513255536556244, + "kl": 0.03979503735899925, + "learning_rate": 1.503e-06, + "loss": 0.002, + "num_tokens": 1623866.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 101.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0089430809021, + "kl": 0.012514011934399605, + "learning_rate": 1.5026666666666668e-06, + "loss": 0.1547, + "num_tokens": 1624133.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 101.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9709498286247253, + "kl": 0.3347504287958145, + "learning_rate": 1.5023333333333334e-06, + "loss": 0.0168, + "num_tokens": 1624438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 101.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037702690809965134, + "kl": 0.0046277036890387535, + "learning_rate": 1.5020000000000002e-06, + "loss": 0.0002, + "num_tokens": 1624734.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 101.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13751274347305298, + "kl": 0.012181914178654552, + "learning_rate": 1.5016666666666665e-06, + "loss": 0.0008, + "num_tokens": 1625010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 101.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031027279794216156, + "kl": 0.0036498650442808867, + "learning_rate": 1.5013333333333335e-06, + "loss": 0.0002, + "num_tokens": 1625294.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 101.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0700439065694809, + "kl": 0.009463974740356207, + "learning_rate": 1.501e-06, + "loss": 0.0005, + "num_tokens": 1625560.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 101.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2186272144317627, + "kl": 0.042012871243059635, + "learning_rate": 1.5006666666666667e-06, + "loss": 0.027, + "num_tokens": 1625892.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 101.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022304037120193243, + "kl": 0.0001068115234375, + "learning_rate": 1.5003333333333333e-06, + "loss": 0.0, + "num_tokens": 1626136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 101.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02951614186167717, + "kl": 0.004512539831921458, + "learning_rate": 1.5e-06, + "loss": 0.0002, + "num_tokens": 1626424.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 101.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035995204001665115, + "kl": 0.003464370034635067, + "learning_rate": 1.4996666666666666e-06, + "loss": 0.0002, + "num_tokens": 1626736.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 101.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01149026583880186, + "kl": 0.0004345825727796182, + "learning_rate": 1.4993333333333334e-06, + "loss": 0.0, + "num_tokens": 1627054.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 101.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.236963987350464, + "kl": 0.07776482030749321, + "learning_rate": 1.499e-06, + "loss": -0.0867, + "num_tokens": 1627423.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 5504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 101.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055479470640420914, + "kl": 0.011052647139877081, + "learning_rate": 1.4986666666666665e-06, + "loss": 0.0005, + "num_tokens": 1627748.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 101.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021672671660780907, + "kl": 0.0016036659362725914, + "learning_rate": 1.4983333333333335e-06, + "loss": 0.0001, + "num_tokens": 1627966.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 101.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.020152568817139, + "kl": 0.04813184216618538, + "learning_rate": 1.498e-06, + "loss": -0.3337, + "num_tokens": 1628278.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 102.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006351388059556484, + "kl": 0.00030978521681390703, + "learning_rate": 1.4976666666666667e-06, + "loss": 0.0, + "num_tokens": 1628550.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 102.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04865812882781029, + "kl": 0.03741905279457569, + "learning_rate": 1.4973333333333335e-06, + "loss": 0.0019, + "num_tokens": 1628822.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 102.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022691408172249794, + "kl": 0.017181613482534885, + "learning_rate": 1.497e-06, + "loss": 0.0009, + "num_tokens": 1629178.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03642177954316139, + "kl": 0.0017040020320564508, + "learning_rate": 1.4966666666666666e-06, + "loss": 0.0001, + "num_tokens": 1629449.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 102.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01144977193325758, + "kl": 0.00036843400448560715, + "learning_rate": 1.4963333333333334e-06, + "loss": 0.0, + "num_tokens": 1629770.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 102.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02119687758386135, + "kl": 0.09590749070048332, + "learning_rate": 1.496e-06, + "loss": 0.0048, + "num_tokens": 1630142.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 102.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036402358673512936, + "kl": 0.0003831803915090859, + "learning_rate": 1.4956666666666667e-06, + "loss": 0.0, + "num_tokens": 1630362.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 102.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038713980466127396, + "kl": 0.16312626004219055, + "learning_rate": 1.4953333333333335e-06, + "loss": 0.0082, + "num_tokens": 1630671.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 102.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00778679084032774, + "kl": 0.0016133278841152787, + "learning_rate": 1.495e-06, + "loss": 0.0001, + "num_tokens": 1630931.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 102.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05482223629951477, + "kl": 0.006359761813655496, + "learning_rate": 1.4946666666666667e-06, + "loss": 0.0003, + "num_tokens": 1631201.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 102.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4725849628448486, + "kl": 0.07971523702144623, + "learning_rate": 1.4943333333333334e-06, + "loss": -0.0011, + "num_tokens": 1631589.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 102.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06893176585435867, + "kl": 0.003924438817193732, + "learning_rate": 1.494e-06, + "loss": 0.0002, + "num_tokens": 1631859.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 102.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021440302953124046, + "kl": 0.0018459950806573033, + "learning_rate": 1.4936666666666666e-06, + "loss": 0.0001, + "num_tokens": 1632155.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 102.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.42820930480957, + "kl": 0.025614461861550808, + "learning_rate": 1.4933333333333334e-06, + "loss": 0.2842, + "num_tokens": 1632470.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 5521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 102.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0855054184794426, + "kl": 0.014258846058510244, + "learning_rate": 1.493e-06, + "loss": 0.0007, + "num_tokens": 1632794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 102.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09472135454416275, + "kl": 0.004273287137039006, + "learning_rate": 1.4926666666666667e-06, + "loss": 0.0002, + "num_tokens": 1633064.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 102.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9774796962738037, + "kl": 0.05759480409324169, + "learning_rate": 1.4923333333333335e-06, + "loss": 0.0005, + "num_tokens": 1633392.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 5524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 102.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1560639142990112, + "kl": 0.4193668905645609, + "learning_rate": 1.492e-06, + "loss": 0.0385, + "num_tokens": 1633798.0, + "reward": 2.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.5, + "step": 5525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 102.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018021684139966965, + "kl": 0.0005127191543579102, + "learning_rate": 1.4916666666666666e-06, + "loss": 0.0, + "num_tokens": 1634010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 102.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020977525040507317, + "kl": 0.005387601675465703, + "learning_rate": 1.4913333333333334e-06, + "loss": 0.0003, + "num_tokens": 1634278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 102.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07715736329555511, + "kl": 0.009784580208361149, + "learning_rate": 1.491e-06, + "loss": 0.0005, + "num_tokens": 1634596.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 102.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053244467824697495, + "kl": 0.002207259414717555, + "learning_rate": 1.4906666666666668e-06, + "loss": 0.0001, + "num_tokens": 1634845.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 102.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10468082129955292, + "kl": 0.004478586371988058, + "learning_rate": 1.4903333333333334e-06, + "loss": 0.0003, + "num_tokens": 1635055.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 102.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025577636435627937, + "kl": 0.0006268088181968778, + "learning_rate": 1.49e-06, + "loss": 0.0, + "num_tokens": 1635288.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 102.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018125461414456367, + "kl": 0.05175800621509552, + "learning_rate": 1.4896666666666667e-06, + "loss": 0.0026, + "num_tokens": 1635620.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 102.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057896487414836884, + "kl": 0.010228496976196766, + "learning_rate": 1.4893333333333335e-06, + "loss": 0.0005, + "num_tokens": 1635880.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 102.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0621013268828392, + "kl": 0.006157606840133667, + "learning_rate": 1.489e-06, + "loss": 0.0003, + "num_tokens": 1636096.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 102.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06662620604038239, + "kl": 0.009493907913565636, + "learning_rate": 1.4886666666666666e-06, + "loss": 0.0005, + "num_tokens": 1636430.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 102.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0063230544328689575, + "kl": 0.0003096287546213716, + "learning_rate": 1.4883333333333334e-06, + "loss": 0.0, + "num_tokens": 1636702.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 102.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3630194664001465, + "kl": 0.02920142188668251, + "learning_rate": 1.488e-06, + "loss": 0.0441, + "num_tokens": 1637006.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23937015235424042, + "kl": 0.03838458959944546, + "learning_rate": 1.4876666666666668e-06, + "loss": 0.002, + "num_tokens": 1637294.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036702681332826614, + "kl": 0.0050619977992028, + "learning_rate": 1.4873333333333333e-06, + "loss": 0.0002, + "num_tokens": 1637591.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.224764108657837, + "kl": 0.18924957513809204, + "learning_rate": 1.487e-06, + "loss": 0.0496, + "num_tokens": 1637882.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 102.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028212007135152817, + "kl": 0.005184866953641176, + "learning_rate": 1.4866666666666667e-06, + "loss": 0.0003, + "num_tokens": 1638173.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010608435608446598, + "kl": 0.008286285679787397, + "learning_rate": 1.4863333333333335e-06, + "loss": 0.0004, + "num_tokens": 1638445.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 102.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.6796929634874687e-05, + "kl": 2.518296241760254e-06, + "learning_rate": 1.486e-06, + "loss": 0.0, + "num_tokens": 1638665.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 102.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008221440366469324, + "kl": 0.003745429217815399, + "learning_rate": 1.4856666666666668e-06, + "loss": 0.0002, + "num_tokens": 1638901.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 102.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02928253449499607, + "kl": 0.0031400781590491533, + "learning_rate": 1.4853333333333334e-06, + "loss": 0.0002, + "num_tokens": 1639185.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 102.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017769435420632362, + "kl": 0.0002772510051727295, + "learning_rate": 1.485e-06, + "loss": 0.0, + "num_tokens": 1639441.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 102.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.749978542327881, + "kl": 0.02927328087389469, + "learning_rate": 1.4846666666666668e-06, + "loss": 0.1456, + "num_tokens": 1639755.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 102.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004367906134575605, + "kl": 0.00016733705706428736, + "learning_rate": 1.4843333333333333e-06, + "loss": 0.0, + "num_tokens": 1640015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 102.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06579401344060898, + "kl": 0.03201588336378336, + "learning_rate": 1.484e-06, + "loss": 0.0017, + "num_tokens": 1640361.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 102.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021259495988488197, + "kl": 0.008463053498417139, + "learning_rate": 1.4836666666666667e-06, + "loss": 0.0004, + "num_tokens": 1640622.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 102.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021426580846309662, + "kl": 0.002101754449540749, + "learning_rate": 1.4833333333333335e-06, + "loss": 0.0001, + "num_tokens": 1640910.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 102.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05924432352185249, + "kl": 0.001211017370223999, + "learning_rate": 1.483e-06, + "loss": 0.0001, + "num_tokens": 1641123.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 102.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04636835306882858, + "kl": 0.002629845286719501, + "learning_rate": 1.4826666666666668e-06, + "loss": 0.0001, + "num_tokens": 1641427.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 102.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025436658412218094, + "kl": 0.018649504985660315, + "learning_rate": 1.4823333333333334e-06, + "loss": 0.001, + "num_tokens": 1641719.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 102.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16768330335617065, + "kl": 0.05305645242333412, + "learning_rate": 1.482e-06, + "loss": 0.0027, + "num_tokens": 1642036.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 102.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058733418583869934, + "kl": 0.008151871152222157, + "learning_rate": 1.4816666666666667e-06, + "loss": 0.0004, + "num_tokens": 1642352.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 102.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005205719266086817, + "kl": 0.00038047814450692385, + "learning_rate": 1.4813333333333333e-06, + "loss": 0.0, + "num_tokens": 1642666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 102.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12997817993164062, + "kl": 0.07159293070435524, + "learning_rate": 1.4809999999999999e-06, + "loss": 0.0035, + "num_tokens": 1643001.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 102.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03440767526626587, + "kl": 0.01249383483082056, + "learning_rate": 1.4806666666666669e-06, + "loss": 0.0007, + "num_tokens": 1643315.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 102.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048427026718854904, + "kl": 0.0012154094874858856, + "learning_rate": 1.4803333333333334e-06, + "loss": 0.0001, + "num_tokens": 1643575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 102.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006804644130170345, + "kl": 0.2676941752433777, + "learning_rate": 1.48e-06, + "loss": 0.0134, + "num_tokens": 1643879.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05653773993253708, + "kl": 0.01362143037840724, + "learning_rate": 1.4796666666666668e-06, + "loss": 0.0007, + "num_tokens": 1644171.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 103.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02322981134057045, + "kl": 0.010478208772838116, + "learning_rate": 1.4793333333333334e-06, + "loss": 0.0005, + "num_tokens": 1644431.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 103.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0641045868396759, + "kl": 0.0728888213634491, + "learning_rate": 1.479e-06, + "loss": 0.0037, + "num_tokens": 1644809.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 103.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05591088905930519, + "kl": 0.015923491679131985, + "learning_rate": 1.4786666666666667e-06, + "loss": 0.0008, + "num_tokens": 1645153.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 103.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016792595386505127, + "kl": 0.0011760814231820405, + "learning_rate": 1.4783333333333333e-06, + "loss": 0.0001, + "num_tokens": 1645477.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 103.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04982414469122887, + "kl": 0.003077237866818905, + "learning_rate": 1.4779999999999999e-06, + "loss": 0.0002, + "num_tokens": 1645789.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12345301359891891, + "kl": 0.006383342933986569, + "learning_rate": 1.4776666666666669e-06, + "loss": 0.0003, + "num_tokens": 1646063.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.352280616760254, + "kl": 0.005991955986246467, + "learning_rate": 1.4773333333333334e-06, + "loss": 0.0247, + "num_tokens": 1646346.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 103.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19035136699676514, + "kl": 0.03718606033362448, + "learning_rate": 1.477e-06, + "loss": 0.0019, + "num_tokens": 1646635.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 103.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1458001136779785, + "kl": 0.17720242589712143, + "learning_rate": 1.4766666666666668e-06, + "loss": 0.074, + "num_tokens": 1646984.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04590500146150589, + "kl": 0.007351910462602973, + "learning_rate": 1.4763333333333334e-06, + "loss": 0.0004, + "num_tokens": 1647301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 103.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009573576389811933, + "kl": 0.003726080060005188, + "learning_rate": 1.476e-06, + "loss": 0.0002, + "num_tokens": 1647537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8554811477661133, + "kl": 0.22396749258041382, + "learning_rate": 1.4756666666666667e-06, + "loss": 0.0387, + "num_tokens": 1647847.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0559222511947155, + "kl": 0.0165237532928586, + "learning_rate": 1.4753333333333333e-06, + "loss": 0.0009, + "num_tokens": 1648120.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 103.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011249528266489506, + "kl": 0.0005452483892440796, + "learning_rate": 1.4749999999999999e-06, + "loss": 0.0, + "num_tokens": 1648380.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02432558685541153, + "kl": 0.0007009912806097418, + "learning_rate": 1.4746666666666668e-06, + "loss": 0.0, + "num_tokens": 1648683.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 103.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013705465942621231, + "kl": 0.0003001019358634949, + "learning_rate": 1.4743333333333334e-06, + "loss": 0.0, + "num_tokens": 1648889.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 103.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.446242809295654, + "kl": 0.019625958055257797, + "learning_rate": 1.474e-06, + "loss": 0.0071, + "num_tokens": 1649133.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.091035842895508, + "kl": 0.024523595813661814, + "learning_rate": 1.4736666666666668e-06, + "loss": -0.0362, + "num_tokens": 1649403.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005106464494019747, + "kl": 0.00025747418112587184, + "learning_rate": 1.4733333333333333e-06, + "loss": 0.0, + "num_tokens": 1649663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 103.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026800744235515594, + "kl": 0.0011164993047714233, + "learning_rate": 1.473e-06, + "loss": 0.0001, + "num_tokens": 1649879.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03890179097652435, + "kl": 0.0009979253518395126, + "learning_rate": 1.4726666666666667e-06, + "loss": 0.0, + "num_tokens": 1650175.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 103.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21989014744758606, + "kl": 0.03462527133524418, + "learning_rate": 1.4723333333333333e-06, + "loss": 0.0017, + "num_tokens": 1650482.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 103.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3923582434654236, + "kl": 0.3532719388604164, + "learning_rate": 1.472e-06, + "loss": -0.0015, + "num_tokens": 1650852.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 5585 + }, + { + "clip_ratio/high_max": 0.007352941203862429, + "clip_ratio/high_mean": 0.007352941203862429, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007352941203862429, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 103.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.079900026321411, + "kl": 0.1506493017077446, + "learning_rate": 1.4716666666666668e-06, + "loss": 0.0183, + "num_tokens": 1651208.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 103.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014649950026068836, + "kl": 4.5746564865112305e-06, + "learning_rate": 1.4713333333333334e-06, + "loss": 0.0, + "num_tokens": 1651428.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 103.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024087300524115562, + "kl": 0.0020767542300745845, + "learning_rate": 1.471e-06, + "loss": 0.0001, + "num_tokens": 1651712.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 103.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.912696838378906, + "kl": 0.15406838431954384, + "learning_rate": 1.4706666666666668e-06, + "loss": 0.0229, + "num_tokens": 1652038.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0738273486495018, + "kl": 0.0131396206561476, + "learning_rate": 1.4703333333333333e-06, + "loss": 0.0007, + "num_tokens": 1652328.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14401409029960632, + "kl": 0.024686934426426888, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.0013, + "num_tokens": 1652614.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05650556460022926, + "kl": 0.006409379479009658, + "learning_rate": 1.4696666666666667e-06, + "loss": 0.0004, + "num_tokens": 1652893.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 103.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031053029000759125, + "kl": 0.0013121436059009284, + "learning_rate": 1.4693333333333333e-06, + "loss": 0.0001, + "num_tokens": 1653201.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 103.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004150730557739735, + "kl": 0.0003677785425679758, + "learning_rate": 1.469e-06, + "loss": 0.0, + "num_tokens": 1653421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09290174394845963, + "kl": 0.04169721156358719, + "learning_rate": 1.4686666666666668e-06, + "loss": 0.0021, + "num_tokens": 1653717.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 103.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0070632826536893845, + "kl": 0.2676911950111389, + "learning_rate": 1.4683333333333334e-06, + "loss": 0.0134, + "num_tokens": 1654021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 103.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008437030017375946, + "kl": 0.0005081224953755736, + "learning_rate": 1.468e-06, + "loss": 0.0, + "num_tokens": 1654256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 103.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18121017515659332, + "kl": 0.03690602537244558, + "learning_rate": 1.4676666666666667e-06, + "loss": 0.0018, + "num_tokens": 1654558.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059240154922008514, + "kl": 0.03684787265956402, + "learning_rate": 1.4673333333333333e-06, + "loss": 0.0019, + "num_tokens": 1654869.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1925406455993652, + "kl": 0.04431818501325324, + "learning_rate": 1.467e-06, + "loss": -0.0011, + "num_tokens": 1655129.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 5600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 103.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022968411445617676, + "kl": 0.0036716292379423976, + "learning_rate": 1.4666666666666667e-06, + "loss": 0.0002, + "num_tokens": 1655459.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011363636702299118, + "clip_ratio/low_min": 0.011363636702299118, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 103.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.631124973297119, + "kl": 0.04231046140193939, + "learning_rate": 1.4663333333333332e-06, + "loss": 0.1384, + "num_tokens": 1655810.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 103.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05637514591217041, + "kl": 0.006388418842107058, + "learning_rate": 1.466e-06, + "loss": 0.0003, + "num_tokens": 1656143.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 103.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042977165430784225, + "kl": 0.004595709848217666, + "learning_rate": 1.4656666666666668e-06, + "loss": 0.0002, + "num_tokens": 1656411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 103.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1270960420370102, + "kl": 0.008030398981645703, + "learning_rate": 1.4653333333333334e-06, + "loss": 0.0004, + "num_tokens": 1656680.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04916525259613991, + "kl": 0.001803569495677948, + "learning_rate": 1.4650000000000002e-06, + "loss": 0.0001, + "num_tokens": 1656948.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 103.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03579654544591904, + "kl": 0.004255052888765931, + "learning_rate": 1.4646666666666667e-06, + "loss": 0.0002, + "num_tokens": 1657241.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 103.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06768617779016495, + "kl": 0.002009347837883979, + "learning_rate": 1.4643333333333333e-06, + "loss": 0.0001, + "num_tokens": 1657498.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 103.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8535503149032593, + "kl": 0.04825827130116522, + "learning_rate": 1.464e-06, + "loss": -0.0154, + "num_tokens": 1657789.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 103.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05372335761785507, + "kl": 0.006013734498992562, + "learning_rate": 1.4636666666666667e-06, + "loss": 0.0003, + "num_tokens": 1658073.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 103.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03579818084836006, + "kl": 0.0003338456153869629, + "learning_rate": 1.4633333333333332e-06, + "loss": 0.0, + "num_tokens": 1658285.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 103.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08977561444044113, + "kl": 0.017718197777867317, + "learning_rate": 1.463e-06, + "loss": 0.001, + "num_tokens": 1658567.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 103.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007300083991140127, + "kl": 0.00182400643825531, + "learning_rate": 1.4626666666666668e-06, + "loss": 0.0001, + "num_tokens": 1658783.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 103.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46247953176498413, + "kl": 0.10592610016465187, + "learning_rate": 1.4623333333333334e-06, + "loss": 0.0056, + "num_tokens": 1659194.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 103.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03938408941030502, + "kl": 0.0015298050711862743, + "learning_rate": 1.4620000000000001e-06, + "loss": 0.0001, + "num_tokens": 1659516.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 104.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12710459530353546, + "kl": 0.03987656719982624, + "learning_rate": 1.4616666666666667e-06, + "loss": 0.002, + "num_tokens": 1659887.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 104.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.122880220413208, + "kl": 0.09745741845108569, + "learning_rate": 1.4613333333333333e-06, + "loss": 0.0406, + "num_tokens": 1660169.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 104.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015912484377622604, + "kl": 0.0006690872251056135, + "learning_rate": 1.461e-06, + "loss": 0.0, + "num_tokens": 1660431.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 104.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02676847018301487, + "kl": 0.003936985274776816, + "learning_rate": 1.4606666666666666e-06, + "loss": 0.0002, + "num_tokens": 1660691.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 104.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2112481594085693, + "kl": 0.20111770555377007, + "learning_rate": 1.4603333333333332e-06, + "loss": 0.0674, + "num_tokens": 1661029.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 104.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7214977741241455, + "kl": 0.02366841584444046, + "learning_rate": 1.4600000000000002e-06, + "loss": -0.0381, + "num_tokens": 1661303.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 104.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08014106750488281, + "kl": 0.031496042385697365, + "learning_rate": 1.4596666666666668e-06, + "loss": 0.0016, + "num_tokens": 1661637.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 104.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10010218620300293, + "kl": 0.004007552575785667, + "learning_rate": 1.4593333333333334e-06, + "loss": 0.0002, + "num_tokens": 1661855.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 104.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5151543617248535, + "kl": 0.0829065702855587, + "learning_rate": 1.4590000000000001e-06, + "loss": 0.1502, + "num_tokens": 1662138.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 104.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08586783707141876, + "kl": 0.01794201135635376, + "learning_rate": 1.4586666666666667e-06, + "loss": 0.0009, + "num_tokens": 1662413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 104.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019931908696889877, + "kl": 0.0005274638533592224, + "learning_rate": 1.4583333333333333e-06, + "loss": 0.0, + "num_tokens": 1662623.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 104.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04691576585173607, + "kl": 0.00750165106728673, + "learning_rate": 1.458e-06, + "loss": 0.0004, + "num_tokens": 1662916.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 104.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008692933479323983, + "kl": 0.003747113049030304, + "learning_rate": 1.4576666666666666e-06, + "loss": 0.0002, + "num_tokens": 1663152.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 104.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002916396129876375, + "kl": 7.414072751998901e-05, + "learning_rate": 1.4573333333333332e-06, + "loss": 0.0, + "num_tokens": 1663364.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 104.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010454630479216576, + "kl": 0.008650038857012987, + "learning_rate": 1.4570000000000002e-06, + "loss": 0.0004, + "num_tokens": 1663636.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 104.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021863967180252075, + "kl": 0.0014585109311155975, + "learning_rate": 1.4566666666666668e-06, + "loss": 0.0001, + "num_tokens": 1663940.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 104.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.051466703414917, + "kl": 0.11221451126039028, + "learning_rate": 1.4563333333333333e-06, + "loss": -0.0166, + "num_tokens": 1664201.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 104.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051440466195344925, + "kl": 0.0034037778386846185, + "learning_rate": 1.4560000000000001e-06, + "loss": 0.0002, + "num_tokens": 1664465.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 104.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08346538990736008, + "kl": 0.006596399703994393, + "learning_rate": 1.4556666666666667e-06, + "loss": 0.0004, + "num_tokens": 1664819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 104.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019270085031166673, + "kl": 6.192177897901274e-05, + "learning_rate": 1.4553333333333333e-06, + "loss": 0.0, + "num_tokens": 1665091.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 104.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005148336756974459, + "kl": 0.00031048059463500977, + "learning_rate": 1.455e-06, + "loss": 0.0, + "num_tokens": 1665351.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 104.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04582850635051727, + "kl": 0.002248986216727644, + "learning_rate": 1.4546666666666666e-06, + "loss": 0.0001, + "num_tokens": 1665665.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 104.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06564143300056458, + "kl": 0.0064549262169748545, + "learning_rate": 1.4543333333333332e-06, + "loss": 0.0003, + "num_tokens": 1665995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 104.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2788805663585663, + "kl": 0.04196894774213433, + "learning_rate": 1.4540000000000002e-06, + "loss": 0.0023, + "num_tokens": 1666282.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 104.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02393532544374466, + "kl": 0.03826040215790272, + "learning_rate": 1.4536666666666668e-06, + "loss": 0.0019, + "num_tokens": 1666686.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 104.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059652626514434814, + "kl": 0.025676672346889973, + "learning_rate": 1.4533333333333333e-06, + "loss": 0.0013, + "num_tokens": 1666986.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 104.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033939704298973083, + "kl": 0.00343378446996212, + "learning_rate": 1.4530000000000001e-06, + "loss": 0.0002, + "num_tokens": 1667270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 104.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7776589393615723, + "kl": 0.13623983785510063, + "learning_rate": 1.4526666666666667e-06, + "loss": 0.1711, + "num_tokens": 1667611.0, + "reward": 2.25, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 5643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 104.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4898393154144287, + "kl": 0.10298647731542587, + "learning_rate": 1.4523333333333332e-06, + "loss": 0.0192, + "num_tokens": 1667996.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 104.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061361782252788544, + "kl": 0.002435504808090627, + "learning_rate": 1.452e-06, + "loss": 0.0001, + "num_tokens": 1668319.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 104.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03603056073188782, + "kl": 0.006204102421179414, + "learning_rate": 1.4516666666666666e-06, + "loss": 0.0003, + "num_tokens": 1668647.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 104.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07706590741872787, + "kl": 0.04034225083887577, + "learning_rate": 1.4513333333333334e-06, + "loss": 0.002, + "num_tokens": 1669010.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 104.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014101438224315643, + "kl": 0.00380882786703296, + "learning_rate": 1.4510000000000002e-06, + "loss": 0.0002, + "num_tokens": 1669274.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 104.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03637513890862465, + "kl": 0.0034911499824374914, + "learning_rate": 1.4506666666666667e-06, + "loss": 0.0002, + "num_tokens": 1669600.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 104.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04727701470255852, + "kl": 0.008573881816118956, + "learning_rate": 1.4503333333333333e-06, + "loss": 0.0004, + "num_tokens": 1669928.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 104.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1317235231399536, + "kl": 0.007987660821527243, + "learning_rate": 1.45e-06, + "loss": 0.0005, + "num_tokens": 1670147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 104.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00016721435531508178, + "kl": 5.558133125305176e-06, + "learning_rate": 1.4496666666666667e-06, + "loss": 0.0, + "num_tokens": 1670367.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 104.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05192406848073006, + "kl": 0.004072215291671455, + "learning_rate": 1.4493333333333334e-06, + "loss": 0.0002, + "num_tokens": 1670667.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 104.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2850669026374817, + "kl": 0.04399473685771227, + "learning_rate": 1.449e-06, + "loss": 0.0024, + "num_tokens": 1670963.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 104.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07298750430345535, + "kl": 0.002120365621522069, + "learning_rate": 1.4486666666666666e-06, + "loss": 0.0002, + "num_tokens": 1671179.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 104.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07781278342008591, + "kl": 0.004349625436589122, + "learning_rate": 1.4483333333333334e-06, + "loss": 0.0002, + "num_tokens": 1671427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 104.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004988092463463545, + "kl": 8.647441791254096e-05, + "learning_rate": 1.4480000000000002e-06, + "loss": 0.0, + "num_tokens": 1671683.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 104.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0739312469959259, + "kl": 0.009387121070176363, + "learning_rate": 1.4476666666666667e-06, + "loss": 0.0005, + "num_tokens": 1672023.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 104.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03655466064810753, + "kl": 0.004713650327175856, + "learning_rate": 1.4473333333333333e-06, + "loss": 0.0002, + "num_tokens": 1672309.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 104.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006822492927312851, + "kl": 0.2677291929721832, + "learning_rate": 1.447e-06, + "loss": 0.0134, + "num_tokens": 1672613.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 104.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03389998897910118, + "kl": 0.002766357734799385, + "learning_rate": 1.4466666666666667e-06, + "loss": 0.0001, + "num_tokens": 1672886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 104.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20362868905067444, + "kl": 0.03436734527349472, + "learning_rate": 1.4463333333333334e-06, + "loss": 0.0017, + "num_tokens": 1673192.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 104.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0704147070646286, + "kl": 0.16538894921541214, + "learning_rate": 1.446e-06, + "loss": 0.0083, + "num_tokens": 1673501.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 104.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021763790398836136, + "kl": 0.0011930970067624003, + "learning_rate": 1.4456666666666666e-06, + "loss": 0.0001, + "num_tokens": 1673736.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 104.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07429869472980499, + "kl": 0.007324169855564833, + "learning_rate": 1.4453333333333334e-06, + "loss": 0.0004, + "num_tokens": 1674030.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 104.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05000847950577736, + "kl": 0.012160141952335835, + "learning_rate": 1.4450000000000001e-06, + "loss": 0.0006, + "num_tokens": 1674316.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 104.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.011960983276367, + "kl": 0.11599424760788679, + "learning_rate": 1.4446666666666667e-06, + "loss": -0.0461, + "num_tokens": 1674586.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 104.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03617606684565544, + "kl": 0.003139002248644829, + "learning_rate": 1.4443333333333335e-06, + "loss": 0.0002, + "num_tokens": 1674898.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 104.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36423632502555847, + "kl": 0.05626663938164711, + "learning_rate": 1.444e-06, + "loss": 0.0029, + "num_tokens": 1675227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 105.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8410458564758301, + "kl": 0.20665767043828964, + "learning_rate": 1.4436666666666666e-06, + "loss": 0.0097, + "num_tokens": 1675599.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 5670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 105.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05876588821411133, + "kl": 0.0005195066332817078, + "learning_rate": 1.4433333333333334e-06, + "loss": 0.0, + "num_tokens": 1675811.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1140060424804688, + "kl": 0.45542351389303803, + "learning_rate": 1.443e-06, + "loss": -0.0565, + "num_tokens": 1676106.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 105.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008352797478437424, + "kl": 8.147954940795898e-05, + "learning_rate": 1.4426666666666666e-06, + "loss": 0.0, + "num_tokens": 1676318.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 105.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18303248286247253, + "kl": 0.05314611457288265, + "learning_rate": 1.4423333333333333e-06, + "loss": 0.0026, + "num_tokens": 1676638.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 105.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8104941844940186, + "kl": 0.27381379902362823, + "learning_rate": 1.4420000000000001e-06, + "loss": -0.0019, + "num_tokens": 1676941.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 105.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02625441737473011, + "kl": 0.003914707922376692, + "learning_rate": 1.4416666666666667e-06, + "loss": 0.0002, + "num_tokens": 1677201.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 105.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038965094834566116, + "kl": 0.0033664272632449865, + "learning_rate": 1.4413333333333335e-06, + "loss": 0.0002, + "num_tokens": 1677461.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 105.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.05448055267334, + "kl": 0.28306935052387416, + "learning_rate": 1.441e-06, + "loss": 0.3229, + "num_tokens": 1677754.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 105.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.566134214401245, + "kl": 0.02297248411923647, + "learning_rate": 1.4406666666666666e-06, + "loss": 0.421, + "num_tokens": 1678134.0, + "reward": 7.300000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.300000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 5679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 105.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026033587753772736, + "kl": 0.0010442649654578418, + "learning_rate": 1.4403333333333334e-06, + "loss": 0.0001, + "num_tokens": 1678448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 105.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.47929310798645, + "kl": 0.06732975505292416, + "learning_rate": 1.44e-06, + "loss": -0.061, + "num_tokens": 1678809.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 5681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 105.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9458839893341064, + "kl": 0.05476866662502289, + "learning_rate": 1.4396666666666665e-06, + "loss": 0.3693, + "num_tokens": 1679198.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 105.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10348553210496902, + "kl": 0.005979364272207022, + "learning_rate": 1.4393333333333335e-06, + "loss": 0.0004, + "num_tokens": 1679408.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4981484115123749, + "kl": 0.06222796067595482, + "learning_rate": 1.4390000000000001e-06, + "loss": 0.0031, + "num_tokens": 1679683.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 105.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018115002661943436, + "kl": 0.00020355880405986682, + "learning_rate": 1.4386666666666667e-06, + "loss": 0.0, + "num_tokens": 1679939.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 105.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.157373905181885, + "kl": 0.06725869793444872, + "learning_rate": 1.4383333333333335e-06, + "loss": 0.1024, + "num_tokens": 1680309.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010161695070564747, + "kl": 0.0003168337279930711, + "learning_rate": 1.438e-06, + "loss": 0.0, + "num_tokens": 1680627.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 105.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004255264066159725, + "kl": 0.0003784775617532432, + "learning_rate": 1.4376666666666666e-06, + "loss": 0.0, + "num_tokens": 1680847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 105.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012154607102274895, + "kl": 0.09744952619075775, + "learning_rate": 1.4373333333333334e-06, + "loss": 0.0049, + "num_tokens": 1681219.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 105.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005785184446722269, + "kl": 0.0007093921303749084, + "learning_rate": 1.437e-06, + "loss": 0.0, + "num_tokens": 1681463.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 105.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04391011223196983, + "kl": 0.004706941545009613, + "learning_rate": 1.4366666666666665e-06, + "loss": 0.0002, + "num_tokens": 1681679.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 105.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06214816868305206, + "kl": 0.009975632186979055, + "learning_rate": 1.4363333333333335e-06, + "loss": 0.0005, + "num_tokens": 1682035.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07380525767803192, + "kl": 0.008595675462856889, + "learning_rate": 1.436e-06, + "loss": 0.0004, + "num_tokens": 1682329.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2147248089313507, + "kl": 0.019854821148328483, + "learning_rate": 1.4356666666666667e-06, + "loss": 0.0011, + "num_tokens": 1682602.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 105.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024057865142822266, + "kl": 0.033351522870361805, + "learning_rate": 1.4353333333333335e-06, + "loss": 0.0017, + "num_tokens": 1683006.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 105.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19571448862552643, + "kl": 0.030045789666473866, + "learning_rate": 1.435e-06, + "loss": 0.0016, + "num_tokens": 1683338.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 105.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020817856420762837, + "kl": 7.569789886474609e-06, + "learning_rate": 1.4346666666666666e-06, + "loss": 0.0, + "num_tokens": 1683558.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009750555269420147, + "kl": 0.004015401005744934, + "learning_rate": 1.4343333333333334e-06, + "loss": 0.0002, + "num_tokens": 1683838.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 105.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017596613615751266, + "kl": 0.000529117402038537, + "learning_rate": 1.434e-06, + "loss": 0.0, + "num_tokens": 1684108.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 105.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000807145785074681, + "kl": 0.0037572383880615234, + "learning_rate": 1.4336666666666665e-06, + "loss": 0.0002, + "num_tokens": 1684344.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 105.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04011266306042671, + "kl": 0.006399928824976087, + "learning_rate": 1.4333333333333335e-06, + "loss": 0.0003, + "num_tokens": 1684639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 105.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09016400575637817, + "kl": 0.1637478619813919, + "learning_rate": 1.433e-06, + "loss": 0.0082, + "num_tokens": 1684949.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 105.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1597907692193985, + "kl": 0.03237841837108135, + "learning_rate": 1.4326666666666667e-06, + "loss": 0.0017, + "num_tokens": 1685249.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 105.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.633193016052246, + "kl": 0.8263365607708693, + "learning_rate": 1.4323333333333334e-06, + "loss": 0.176, + "num_tokens": 1685608.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 105.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2390611320734024, + "kl": 0.03568451013416052, + "learning_rate": 1.432e-06, + "loss": 0.0018, + "num_tokens": 1685930.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5705 + }, + { + "clip_ratio/high_max": 0.006410256493836641, + "clip_ratio/high_mean": 0.006410256493836641, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006410256493836641, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 105.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5669021606445312, + "kl": 0.17164346575737, + "learning_rate": 1.4316666666666666e-06, + "loss": 0.0211, + "num_tokens": 1686310.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 105.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8500077724456787, + "kl": 0.04501553252339363, + "learning_rate": 1.4313333333333334e-06, + "loss": 0.2092, + "num_tokens": 1686604.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 105.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09812217950820923, + "kl": 0.0398064237087965, + "learning_rate": 1.431e-06, + "loss": 0.0021, + "num_tokens": 1686920.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 105.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2176378071308136, + "kl": 0.019492600520607084, + "learning_rate": 1.4306666666666667e-06, + "loss": 0.0011, + "num_tokens": 1687202.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 105.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048696164041757584, + "kl": 0.006918958388268948, + "learning_rate": 1.4303333333333335e-06, + "loss": 0.0004, + "num_tokens": 1687500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 105.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0727105364203453, + "kl": 0.010666396003216505, + "learning_rate": 1.43e-06, + "loss": 0.0006, + "num_tokens": 1687794.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 105.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09294035285711288, + "kl": 0.009225911926478148, + "learning_rate": 1.4296666666666666e-06, + "loss": 0.0005, + "num_tokens": 1688134.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 105.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012998809106647968, + "kl": 0.0015042490558698773, + "learning_rate": 1.4293333333333334e-06, + "loss": 0.0001, + "num_tokens": 1688411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 105.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8534812927246094, + "kl": 0.0837114229798317, + "learning_rate": 1.429e-06, + "loss": 0.1406, + "num_tokens": 1688760.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 105.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039196960628032684, + "kl": 0.0017774586740415543, + "learning_rate": 1.4286666666666668e-06, + "loss": 0.0001, + "num_tokens": 1689062.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 105.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002460814779624343, + "kl": 0.00044571078615263104, + "learning_rate": 1.4283333333333334e-06, + "loss": 0.0, + "num_tokens": 1689296.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 105.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08185262233018875, + "kl": 0.007727860473096371, + "learning_rate": 1.428e-06, + "loss": 0.0004, + "num_tokens": 1689568.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 105.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01775561459362507, + "kl": 0.002369391731917858, + "learning_rate": 1.4276666666666667e-06, + "loss": 0.0001, + "num_tokens": 1689880.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 105.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3227272629737854, + "kl": 0.057049446273595095, + "learning_rate": 1.4273333333333335e-06, + "loss": 0.0036, + "num_tokens": 1690177.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 105.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1344035565853119, + "kl": 0.013386741280555725, + "learning_rate": 1.427e-06, + "loss": 0.0007, + "num_tokens": 1690451.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 105.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02759510837495327, + "kl": 0.0032271374948322773, + "learning_rate": 1.4266666666666666e-06, + "loss": 0.0002, + "num_tokens": 1690741.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 105.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.844181060791016, + "kl": 0.060532329604029655, + "learning_rate": 1.4263333333333334e-06, + "loss": -0.1647, + "num_tokens": 1691052.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 105.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008276209235191345, + "kl": 0.00040553510189056396, + "learning_rate": 1.426e-06, + "loss": 0.0, + "num_tokens": 1691312.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 106.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5478247404098511, + "kl": 0.10455014044418931, + "learning_rate": 1.4256666666666668e-06, + "loss": 0.0056, + "num_tokens": 1691582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 106.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02557133510708809, + "kl": 0.0004163682460784912, + "learning_rate": 1.4253333333333333e-06, + "loss": 0.0, + "num_tokens": 1691792.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 106.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050726309418678284, + "kl": 0.011864948086440563, + "learning_rate": 1.425e-06, + "loss": 0.0006, + "num_tokens": 1692126.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0785820484161377, + "kl": 0.05373286455869675, + "learning_rate": 1.4246666666666667e-06, + "loss": 0.0204, + "num_tokens": 1692397.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 106.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061437562108039856, + "kl": 0.03129961155354977, + "learning_rate": 1.4243333333333335e-06, + "loss": 0.0016, + "num_tokens": 1692730.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 106.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13399158418178558, + "kl": 0.024402556009590626, + "learning_rate": 1.424e-06, + "loss": 0.0013, + "num_tokens": 1693063.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 106.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8997130393981934, + "kl": 0.11000430583953857, + "learning_rate": 1.4236666666666668e-06, + "loss": 0.169, + "num_tokens": 1693423.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 106.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.107804775238037, + "kl": 0.057356974110007286, + "learning_rate": 1.4233333333333334e-06, + "loss": 0.115, + "num_tokens": 1693733.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 106.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03562408313155174, + "kl": 0.009137207642197609, + "learning_rate": 1.423e-06, + "loss": 0.0004, + "num_tokens": 1694028.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 106.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026240911334753036, + "kl": 0.0019951232243329287, + "learning_rate": 1.4226666666666668e-06, + "loss": 0.0001, + "num_tokens": 1694324.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 106.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044375013560056686, + "kl": 0.0008645802736282349, + "learning_rate": 1.4223333333333333e-06, + "loss": 0.0, + "num_tokens": 1694556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0463753268122673, + "kl": 0.01417520921677351, + "learning_rate": 1.422e-06, + "loss": 0.0007, + "num_tokens": 1694836.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 106.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1392743587493896, + "kl": 0.10343851149082184, + "learning_rate": 1.4216666666666667e-06, + "loss": -0.0832, + "num_tokens": 1695200.0, + "reward": 5.625, + "reward_std": 2.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 2.75, + "step": 5736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02173640951514244, + "kl": 0.000323873755405657, + "learning_rate": 1.4213333333333335e-06, + "loss": 0.0, + "num_tokens": 1695456.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 106.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04348168149590492, + "kl": 0.007410618243739009, + "learning_rate": 1.421e-06, + "loss": 0.0004, + "num_tokens": 1695747.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 106.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010557022877037525, + "kl": 0.163717582821846, + "learning_rate": 1.4206666666666668e-06, + "loss": 0.0082, + "num_tokens": 1696055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 106.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08273578435182571, + "kl": 0.004094600095413625, + "learning_rate": 1.4203333333333334e-06, + "loss": 0.0002, + "num_tokens": 1696368.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 106.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06082385778427124, + "kl": 0.007373373955488205, + "learning_rate": 1.42e-06, + "loss": 0.0004, + "num_tokens": 1696707.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 106.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6933389902114868, + "kl": 0.10848630405962467, + "learning_rate": 1.4196666666666667e-06, + "loss": 0.092, + "num_tokens": 1697095.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 106.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014743284322321415, + "kl": 0.09623927995562553, + "learning_rate": 1.4193333333333333e-06, + "loss": 0.0048, + "num_tokens": 1697468.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 106.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008723534410819411, + "kl": 0.003734111785888672, + "learning_rate": 1.4189999999999999e-06, + "loss": 0.0002, + "num_tokens": 1697704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046619925647974014, + "kl": 0.0046431400696747005, + "learning_rate": 1.4186666666666669e-06, + "loss": 0.0003, + "num_tokens": 1697966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 106.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009092474356293678, + "kl": 0.001768447458744049, + "learning_rate": 1.4183333333333334e-06, + "loss": 0.0001, + "num_tokens": 1698182.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0892452523112297, + "kl": 0.006601458648219705, + "learning_rate": 1.418e-06, + "loss": 0.0004, + "num_tokens": 1698456.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01983843743801117, + "kl": 0.0015462132869288325, + "learning_rate": 1.4176666666666668e-06, + "loss": 0.0001, + "num_tokens": 1698740.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 106.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0084643280133605, + "kl": 0.26748037338256836, + "learning_rate": 1.4173333333333334e-06, + "loss": 0.0134, + "num_tokens": 1699044.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.700222492218018, + "kl": 0.01191516499966383, + "learning_rate": 1.417e-06, + "loss": 0.2333, + "num_tokens": 1699338.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 5750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 106.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016187675297260284, + "kl": 0.0024367207661271095, + "learning_rate": 1.4166666666666667e-06, + "loss": 0.0001, + "num_tokens": 1699650.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02424757368862629, + "kl": 0.0009895925759337842, + "learning_rate": 1.4163333333333333e-06, + "loss": 0.0, + "num_tokens": 1699920.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 106.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11237373948097229, + "kl": 0.02521005505695939, + "learning_rate": 1.4159999999999999e-06, + "loss": 0.0013, + "num_tokens": 1700270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 106.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01388302631676197, + "kl": 0.003629215876571834, + "learning_rate": 1.4156666666666669e-06, + "loss": 0.0002, + "num_tokens": 1700548.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 106.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06473202258348465, + "kl": 0.006293442100286484, + "learning_rate": 1.4153333333333334e-06, + "loss": 0.0003, + "num_tokens": 1700869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 106.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0050394535064697, + "kl": 0.030086702667176723, + "learning_rate": 1.415e-06, + "loss": 0.0002, + "num_tokens": 1701217.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 106.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.69318675994873, + "kl": 0.02784450352191925, + "learning_rate": 1.4146666666666668e-06, + "loss": 0.0684, + "num_tokens": 1701479.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 106.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14703218638896942, + "kl": 0.004087294219061732, + "learning_rate": 1.4143333333333334e-06, + "loss": 0.0003, + "num_tokens": 1701699.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 106.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03757351636886597, + "kl": 0.002771433792077005, + "learning_rate": 1.414e-06, + "loss": 0.0001, + "num_tokens": 1701959.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 106.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002695656439755112, + "kl": 1.4044344425201416e-05, + "learning_rate": 1.4136666666666667e-06, + "loss": 0.0, + "num_tokens": 1702179.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 106.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6176536083221436, + "kl": 0.042499613016843796, + "learning_rate": 1.4133333333333333e-06, + "loss": 0.0027, + "num_tokens": 1702473.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 5761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09699300676584244, + "kl": 0.015115597750991583, + "learning_rate": 1.4129999999999999e-06, + "loss": 0.0008, + "num_tokens": 1702737.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.399329662322998, + "kl": 0.02724478906020522, + "learning_rate": 1.4126666666666668e-06, + "loss": 0.0516, + "num_tokens": 1703018.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 106.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006087391637265682, + "kl": 0.00012576580047607422, + "learning_rate": 1.4123333333333334e-06, + "loss": 0.0, + "num_tokens": 1703230.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 106.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06041878089308739, + "kl": 0.00847070338204503, + "learning_rate": 1.412e-06, + "loss": 0.0004, + "num_tokens": 1703522.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 106.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03455105051398277, + "kl": 0.002691088360734284, + "learning_rate": 1.4116666666666668e-06, + "loss": 0.0001, + "num_tokens": 1703830.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 106.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01585678942501545, + "kl": 0.035224828869104385, + "learning_rate": 1.4113333333333333e-06, + "loss": 0.0017, + "num_tokens": 1704247.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 106.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08282896131277084, + "kl": 0.0035541802644729614, + "learning_rate": 1.411e-06, + "loss": 0.0002, + "num_tokens": 1704491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026355255395174026, + "kl": 0.0038483203388750553, + "learning_rate": 1.4106666666666667e-06, + "loss": 0.0002, + "num_tokens": 1704751.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 106.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28263741731643677, + "kl": 0.033234777161851525, + "learning_rate": 1.4103333333333333e-06, + "loss": 0.0015, + "num_tokens": 1705074.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 106.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03897060081362724, + "kl": 0.01066239271312952, + "learning_rate": 1.41e-06, + "loss": 0.0005, + "num_tokens": 1705358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 106.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034062473569065332, + "kl": 0.0003975331783294678, + "learning_rate": 1.4096666666666668e-06, + "loss": 0.0, + "num_tokens": 1705578.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 106.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04890957847237587, + "kl": 0.009005682077258825, + "learning_rate": 1.4093333333333334e-06, + "loss": 0.0005, + "num_tokens": 1705846.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 106.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009102567099034786, + "kl": 0.0004160125972703099, + "learning_rate": 1.409e-06, + "loss": 0.0, + "num_tokens": 1706164.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 106.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008985929889604449, + "kl": 0.001148223876953125, + "learning_rate": 1.4086666666666668e-06, + "loss": 0.0001, + "num_tokens": 1706444.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 106.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04493812471628189, + "kl": 0.04164758883416653, + "learning_rate": 1.4083333333333333e-06, + "loss": 0.0022, + "num_tokens": 1706796.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 106.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07303241640329361, + "kl": 0.006453113630414009, + "learning_rate": 1.4080000000000001e-06, + "loss": 0.0003, + "num_tokens": 1707067.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 107.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09221452474594116, + "kl": 0.01181476260535419, + "learning_rate": 1.4076666666666667e-06, + "loss": 0.0006, + "num_tokens": 1707337.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 107.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9778470993041992, + "kl": 0.011919782496988773, + "learning_rate": 1.4073333333333333e-06, + "loss": 0.0454, + "num_tokens": 1707630.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 5779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 107.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029590746387839317, + "kl": 0.0021514305844902992, + "learning_rate": 1.407e-06, + "loss": 0.0001, + "num_tokens": 1707939.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 107.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07139435410499573, + "kl": 0.013602410908788443, + "learning_rate": 1.4066666666666668e-06, + "loss": 0.0007, + "num_tokens": 1708270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 107.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0078984210267663, + "kl": 0.26756927371025085, + "learning_rate": 1.4063333333333334e-06, + "loss": 0.0134, + "num_tokens": 1708574.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 107.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06545332074165344, + "kl": 0.01546543464064598, + "learning_rate": 1.406e-06, + "loss": 0.0009, + "num_tokens": 1708856.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 107.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006662515923380852, + "kl": 4.482269287109375e-05, + "learning_rate": 1.4056666666666667e-06, + "loss": 0.0, + "num_tokens": 1709068.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 107.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005747989285737276, + "kl": 0.000629764050245285, + "learning_rate": 1.4053333333333333e-06, + "loss": 0.0, + "num_tokens": 1709312.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 107.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05726177617907524, + "kl": 0.013715836685150862, + "learning_rate": 1.405e-06, + "loss": 0.0007, + "num_tokens": 1709618.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 107.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012316490523517132, + "kl": 0.09743694961071014, + "learning_rate": 1.4046666666666667e-06, + "loss": 0.0049, + "num_tokens": 1709990.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 107.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03672131150960922, + "kl": 0.0024512840900570154, + "learning_rate": 1.4043333333333332e-06, + "loss": 0.0001, + "num_tokens": 1710250.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 107.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06388644874095917, + "kl": 0.0028850616654381156, + "learning_rate": 1.404e-06, + "loss": 0.0001, + "num_tokens": 1710520.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 107.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05095580220222473, + "kl": 0.00374322640709579, + "learning_rate": 1.4036666666666668e-06, + "loss": 0.0002, + "num_tokens": 1710825.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 107.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034039080142974854, + "kl": 0.00621016975492239, + "learning_rate": 1.4033333333333334e-06, + "loss": 0.0003, + "num_tokens": 1711115.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 107.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21955059468746185, + "kl": 0.026480155996978283, + "learning_rate": 1.4030000000000002e-06, + "loss": 0.0012, + "num_tokens": 1711376.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 107.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10853764414787292, + "kl": 0.018656094325706363, + "learning_rate": 1.4026666666666667e-06, + "loss": 0.001, + "num_tokens": 1711734.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 107.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009091355837881565, + "kl": 0.00041239398706238717, + "learning_rate": 1.4023333333333333e-06, + "loss": 0.0, + "num_tokens": 1712054.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 107.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07535656541585922, + "kl": 0.1569998860359192, + "learning_rate": 1.402e-06, + "loss": 0.0078, + "num_tokens": 1712369.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 107.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5146821141242981, + "kl": 0.10110709443688393, + "learning_rate": 1.4016666666666667e-06, + "loss": 0.005, + "num_tokens": 1712786.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 107.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6395635008811951, + "kl": 0.07085668295621872, + "learning_rate": 1.4013333333333332e-06, + "loss": 0.0039, + "num_tokens": 1713077.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 107.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1346423178911209, + "kl": 0.015416470589116216, + "learning_rate": 1.401e-06, + "loss": 0.0008, + "num_tokens": 1713407.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 107.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.016915321350098, + "kl": 0.12276065722107887, + "learning_rate": 1.4006666666666668e-06, + "loss": -0.1486, + "num_tokens": 1713766.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 5799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 107.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.519293785095215, + "kl": 0.03627302497625351, + "learning_rate": 1.4003333333333334e-06, + "loss": -0.0489, + "num_tokens": 1714116.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 107.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4300806522369385, + "kl": 0.6645773788914084, + "learning_rate": 1.4000000000000001e-06, + "loss": -0.0207, + "num_tokens": 1714388.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 107.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007795859128236771, + "kl": 0.0020574331283569336, + "learning_rate": 1.3996666666666667e-06, + "loss": 0.0001, + "num_tokens": 1714604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 107.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033531829714775085, + "kl": 0.0029731886461377144, + "learning_rate": 1.3993333333333333e-06, + "loss": 0.0001, + "num_tokens": 1714916.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 107.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.84966778755188, + "kl": 0.046946557238698006, + "learning_rate": 1.399e-06, + "loss": -0.1613, + "num_tokens": 1715227.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 5804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 107.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026002289727330208, + "kl": 0.0014131814241409302, + "learning_rate": 1.3986666666666666e-06, + "loss": 0.0001, + "num_tokens": 1715439.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 107.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955051675438881, + "kl": 0.002813410828821361, + "learning_rate": 1.3983333333333332e-06, + "loss": 0.0001, + "num_tokens": 1715657.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 107.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.937253952026367, + "kl": 0.12958155944943428, + "learning_rate": 1.3980000000000002e-06, + "loss": -0.0412, + "num_tokens": 1715948.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 107.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08042522519826889, + "kl": 0.031298305839300156, + "learning_rate": 1.3976666666666668e-06, + "loss": 0.0016, + "num_tokens": 1716250.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 107.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7190862894058228, + "kl": 0.07656094618141651, + "learning_rate": 1.3973333333333334e-06, + "loss": 0.0428, + "num_tokens": 1716586.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 5809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 107.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008943743072450161, + "kl": 0.00032437642221339047, + "learning_rate": 1.3970000000000001e-06, + "loss": 0.0, + "num_tokens": 1716856.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 107.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2300005555152893, + "kl": 0.022971992380917072, + "learning_rate": 1.3966666666666667e-06, + "loss": 0.0012, + "num_tokens": 1717136.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 107.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12222582846879959, + "kl": 0.041981762275099754, + "learning_rate": 1.3963333333333333e-06, + "loss": 0.0024, + "num_tokens": 1717457.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 107.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008262148476205766, + "kl": 0.001231085043400526, + "learning_rate": 1.396e-06, + "loss": 0.0001, + "num_tokens": 1717737.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 107.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08274615556001663, + "kl": 0.00374753400683403, + "learning_rate": 1.3956666666666666e-06, + "loss": 0.0002, + "num_tokens": 1717997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 107.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05769728869199753, + "kl": 0.007248182548210025, + "learning_rate": 1.3953333333333332e-06, + "loss": 0.0004, + "num_tokens": 1718269.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 107.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3610689640045166, + "kl": 0.04986991360783577, + "learning_rate": 1.3950000000000002e-06, + "loss": 0.0078, + "num_tokens": 1718637.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 107.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14683116972446442, + "kl": 0.012522359378635883, + "learning_rate": 1.3946666666666668e-06, + "loss": 0.0006, + "num_tokens": 1718897.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 107.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07415807247161865, + "kl": 0.01777966320514679, + "learning_rate": 1.3943333333333333e-06, + "loss": 0.0009, + "num_tokens": 1719222.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 107.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0116240493953228, + "kl": 0.0008336433675140142, + "learning_rate": 1.3940000000000001e-06, + "loss": 0.0, + "num_tokens": 1719457.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 107.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008049916476011276, + "kl": 0.00030324608087539673, + "learning_rate": 1.3936666666666667e-06, + "loss": 0.0, + "num_tokens": 1719665.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 107.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018872126936912537, + "kl": 0.002289417083375156, + "learning_rate": 1.3933333333333333e-06, + "loss": 0.0001, + "num_tokens": 1719947.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 107.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007522659143432975, + "kl": 0.0037667453289031982, + "learning_rate": 1.393e-06, + "loss": 0.0002, + "num_tokens": 1720183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 107.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02633531577885151, + "kl": 0.0032656221883371472, + "learning_rate": 1.3926666666666666e-06, + "loss": 0.0002, + "num_tokens": 1720443.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 107.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10278723388910294, + "kl": 0.036409737542271614, + "learning_rate": 1.3923333333333332e-06, + "loss": 0.0017, + "num_tokens": 1720720.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 107.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0425138883292675, + "kl": 0.004189606406725943, + "learning_rate": 1.3920000000000002e-06, + "loss": 0.0002, + "num_tokens": 1720995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 107.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001723933091852814, + "kl": 5.6549906730651855e-06, + "learning_rate": 1.3916666666666668e-06, + "loss": 0.0, + "num_tokens": 1721215.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 107.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03437779098749161, + "kl": 0.002885287278331816, + "learning_rate": 1.3913333333333333e-06, + "loss": 0.0001, + "num_tokens": 1721543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 107.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07442420721054077, + "kl": 0.011151036713272333, + "learning_rate": 1.3910000000000001e-06, + "loss": 0.0006, + "num_tokens": 1721843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 39.0, + "completions/mean_terminated_length": 39.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 107.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9135982990264893, + "kl": 0.12119313701987267, + "learning_rate": 1.3906666666666667e-06, + "loss": 0.2191, + "num_tokens": 1722223.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 107.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034506842494010925, + "kl": 0.0037065488286316395, + "learning_rate": 1.3903333333333332e-06, + "loss": 0.0002, + "num_tokens": 1722551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 107.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7274201512336731, + "kl": 0.03486140817403793, + "learning_rate": 1.39e-06, + "loss": 0.0017, + "num_tokens": 1722819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028645237907767296, + "kl": 0.0005542500639421633, + "learning_rate": 1.3896666666666666e-06, + "loss": 0.0, + "num_tokens": 1723076.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 108.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29568982124328613, + "kl": 0.041215645149350166, + "learning_rate": 1.3893333333333334e-06, + "loss": 0.0024, + "num_tokens": 1723364.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.688390254974365, + "kl": 0.031630974262952805, + "learning_rate": 1.3890000000000002e-06, + "loss": 0.3448, + "num_tokens": 1723681.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 5834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 108.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043932944536209106, + "kl": 0.017253287136554718, + "learning_rate": 1.3886666666666667e-06, + "loss": 0.0009, + "num_tokens": 1723973.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 108.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.387324333190918, + "kl": 0.08686452358961105, + "learning_rate": 1.3883333333333333e-06, + "loss": 0.0595, + "num_tokens": 1724360.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 108.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00738773075863719, + "kl": 0.0019551292061805725, + "learning_rate": 1.388e-06, + "loss": 0.0001, + "num_tokens": 1724576.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 108.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03217955306172371, + "kl": 0.006154460366815329, + "learning_rate": 1.3876666666666667e-06, + "loss": 0.0003, + "num_tokens": 1724912.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 108.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02707524597644806, + "kl": 0.0025644907727837563, + "learning_rate": 1.3873333333333334e-06, + "loss": 0.0001, + "num_tokens": 1725191.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 108.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013815261423587799, + "kl": 0.09787509590387344, + "learning_rate": 1.387e-06, + "loss": 0.0049, + "num_tokens": 1725563.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 108.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011325486935675144, + "kl": 0.00042543228482827544, + "learning_rate": 1.3866666666666666e-06, + "loss": 0.0, + "num_tokens": 1725882.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 108.18518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.690441131591797, + "kl": 1.1164008472114801, + "learning_rate": 1.3863333333333334e-06, + "loss": 0.0721, + "num_tokens": 1726156.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 5842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 108.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1334773600101471, + "kl": 0.042811835184693336, + "learning_rate": 1.3860000000000002e-06, + "loss": 0.0021, + "num_tokens": 1726473.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 108.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.16800856590271, + "kl": 0.0005193196120671928, + "learning_rate": 1.3856666666666667e-06, + "loss": 0.0516, + "num_tokens": 1726789.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 108.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027450840920209885, + "kl": 0.0009651482105255127, + "learning_rate": 1.3853333333333333e-06, + "loss": 0.0001, + "num_tokens": 1727005.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 108.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021008362993597984, + "kl": 0.0007641123665962368, + "learning_rate": 1.385e-06, + "loss": 0.0, + "num_tokens": 1727283.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 108.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010142466984689236, + "kl": 0.0020509595051407814, + "learning_rate": 1.3846666666666667e-06, + "loss": 0.0001, + "num_tokens": 1727595.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 108.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07904906570911407, + "kl": 0.06495491415262222, + "learning_rate": 1.3843333333333334e-06, + "loss": 0.0032, + "num_tokens": 1727931.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 108.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.950024127960205, + "kl": 0.15229973196983337, + "learning_rate": 1.384e-06, + "loss": -0.0119, + "num_tokens": 1728216.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 108.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.519428730010986, + "kl": 0.059378063306212425, + "learning_rate": 1.3836666666666666e-06, + "loss": -0.0929, + "num_tokens": 1728532.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 5850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 108.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01905548758804798, + "kl": 0.0007375776185654104, + "learning_rate": 1.3833333333333334e-06, + "loss": 0.0, + "num_tokens": 1728796.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 108.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0805647149682045, + "kl": 0.017920501995831728, + "learning_rate": 1.3830000000000001e-06, + "loss": 0.001, + "num_tokens": 1729059.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 108.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022450012329500169, + "kl": 9.082257747650146e-06, + "learning_rate": 1.3826666666666667e-06, + "loss": 0.0, + "num_tokens": 1729279.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 108.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15444353222846985, + "kl": 0.02643286157399416, + "learning_rate": 1.3823333333333335e-06, + "loss": 0.0013, + "num_tokens": 1729547.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 108.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.534971714019775, + "kl": 0.09001019224524498, + "learning_rate": 1.382e-06, + "loss": -0.0189, + "num_tokens": 1729849.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 108.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04774514585733414, + "kl": 0.006973513402044773, + "learning_rate": 1.3816666666666666e-06, + "loss": 0.0003, + "num_tokens": 1730138.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 108.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.169586658477783, + "kl": 0.1647091545164585, + "learning_rate": 1.3813333333333334e-06, + "loss": 0.1695, + "num_tokens": 1730476.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 5857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 108.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03908167779445648, + "kl": 0.0012298872461542487, + "learning_rate": 1.381e-06, + "loss": 0.0001, + "num_tokens": 1730711.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 108.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059330303221940994, + "kl": 0.004590214230120182, + "learning_rate": 1.3806666666666666e-06, + "loss": 0.0002, + "num_tokens": 1731017.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 108.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012992694973945618, + "kl": 0.0009702034294605255, + "learning_rate": 1.3803333333333333e-06, + "loss": 0.0, + "num_tokens": 1731277.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 108.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5483787059783936, + "kl": 0.07131713803391904, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.0039, + "num_tokens": 1731564.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01057702861726284, + "kl": 0.0004176706133875996, + "learning_rate": 1.3796666666666667e-06, + "loss": 0.0, + "num_tokens": 1731834.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 108.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018096506595611572, + "kl": 0.0004948079586029053, + "learning_rate": 1.3793333333333335e-06, + "loss": 0.0, + "num_tokens": 1732042.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 108.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06367022544145584, + "kl": 0.02664570207707584, + "learning_rate": 1.379e-06, + "loss": 0.0009, + "num_tokens": 1732365.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 108.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006638620048761368, + "kl": 0.2678298354148865, + "learning_rate": 1.3786666666666666e-06, + "loss": 0.0134, + "num_tokens": 1732669.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 108.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02574746496975422, + "kl": 0.033431777730584145, + "learning_rate": 1.3783333333333334e-06, + "loss": 0.0017, + "num_tokens": 1733073.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 108.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013389160856604576, + "kl": 0.0002528697223169729, + "learning_rate": 1.378e-06, + "loss": 0.0, + "num_tokens": 1733286.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0481690987944603, + "kl": 0.0010759511023934465, + "learning_rate": 1.3776666666666665e-06, + "loss": 0.0001, + "num_tokens": 1733543.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 108.68518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.645321846008301, + "kl": 0.0175365237519145, + "learning_rate": 1.3773333333333335e-06, + "loss": -0.0406, + "num_tokens": 1733842.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 108.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09285957366228104, + "kl": 0.042318107560276985, + "learning_rate": 1.3770000000000001e-06, + "loss": 0.0021, + "num_tokens": 1734181.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 108.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1753692626953125, + "kl": 0.17156246025115252, + "learning_rate": 1.3766666666666667e-06, + "loss": 0.1495, + "num_tokens": 1734454.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040205683559179306, + "kl": 0.0023798730690032244, + "learning_rate": 1.3763333333333335e-06, + "loss": 0.0001, + "num_tokens": 1734724.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 108.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028733454644680023, + "kl": 0.019942507147789, + "learning_rate": 1.376e-06, + "loss": 0.001, + "num_tokens": 1735086.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 108.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04645894840359688, + "kl": 0.003785040695220232, + "learning_rate": 1.3756666666666666e-06, + "loss": 0.0002, + "num_tokens": 1735384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 108.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06303104013204575, + "kl": 0.0027426481246948242, + "learning_rate": 1.3753333333333334e-06, + "loss": 0.0001, + "num_tokens": 1735638.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 108.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31005018949508667, + "kl": 0.054848093539476395, + "learning_rate": 1.375e-06, + "loss": 0.0026, + "num_tokens": 1735926.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 108.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040029969066381454, + "kl": 0.16291894018650055, + "learning_rate": 1.3746666666666665e-06, + "loss": 0.0081, + "num_tokens": 1736235.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 108.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004446973092854023, + "kl": 0.0003317773371236399, + "learning_rate": 1.3743333333333335e-06, + "loss": 0.0, + "num_tokens": 1736495.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 108.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008780839270912111, + "kl": 0.0037524476647377014, + "learning_rate": 1.374e-06, + "loss": 0.0002, + "num_tokens": 1736731.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 108.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06579489260911942, + "kl": 0.00977464858442545, + "learning_rate": 1.3736666666666667e-06, + "loss": 0.0005, + "num_tokens": 1737062.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 108.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04814428091049194, + "kl": 0.01773000694811344, + "learning_rate": 1.3733333333333335e-06, + "loss": 0.0009, + "num_tokens": 1737358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 108.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11146484315395355, + "kl": 0.02189553901553154, + "learning_rate": 1.373e-06, + "loss": 0.0011, + "num_tokens": 1737682.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 108.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3046683967113495, + "kl": 0.02081088093109429, + "learning_rate": 1.3726666666666666e-06, + "loss": 0.0012, + "num_tokens": 1737903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 108.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0937881469726562, + "kl": 0.08825670927762985, + "learning_rate": 1.3723333333333334e-06, + "loss": 0.022, + "num_tokens": 1738288.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 108.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5862605571746826, + "kl": 0.07976600714027882, + "learning_rate": 1.372e-06, + "loss": 0.0382, + "num_tokens": 1738643.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 109.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10566570609807968, + "kl": 0.005577021976932883, + "learning_rate": 1.3716666666666665e-06, + "loss": 0.0003, + "num_tokens": 1738903.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 109.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03496198356151581, + "kl": 0.002271134697366506, + "learning_rate": 1.3713333333333335e-06, + "loss": 0.0001, + "num_tokens": 1739176.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 109.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01300653163343668, + "kl": 0.009744971990585327, + "learning_rate": 1.371e-06, + "loss": 0.0005, + "num_tokens": 1739480.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 109.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05436874181032181, + "kl": 0.012729277834296227, + "learning_rate": 1.3706666666666667e-06, + "loss": 0.0006, + "num_tokens": 1739773.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 109.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03597012534737587, + "kl": 0.003065040917135775, + "learning_rate": 1.3703333333333334e-06, + "loss": 0.0002, + "num_tokens": 1740055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0835784301161766, + "kl": 0.0030799253727309406, + "learning_rate": 1.37e-06, + "loss": 0.0001, + "num_tokens": 1740274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00026008597342297435, + "kl": 1.1272728443145752e-05, + "learning_rate": 1.3696666666666666e-06, + "loss": 0.0, + "num_tokens": 1740494.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 109.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03182618319988251, + "kl": 0.053854506462812424, + "learning_rate": 1.3693333333333334e-06, + "loss": 0.0027, + "num_tokens": 1740859.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1318359375, + "kl": 0.2338090594857931, + "learning_rate": 1.369e-06, + "loss": 0.0629, + "num_tokens": 1741187.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 109.16666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8306169509887695, + "kl": 0.16191583452746272, + "learning_rate": 1.3686666666666667e-06, + "loss": 0.0818, + "num_tokens": 1741487.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 109.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15355105698108673, + "kl": 0.028837244026362896, + "learning_rate": 1.3683333333333335e-06, + "loss": 0.0014, + "num_tokens": 1741755.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1706068515777588, + "kl": 0.020157979801297188, + "learning_rate": 1.368e-06, + "loss": 0.001, + "num_tokens": 1742049.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000847788352984935, + "kl": 0.0037572309374809265, + "learning_rate": 1.3676666666666666e-06, + "loss": 0.0002, + "num_tokens": 1742285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 109.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0958934798836708, + "kl": 0.027908511459827423, + "learning_rate": 1.3673333333333334e-06, + "loss": 0.0014, + "num_tokens": 1742587.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 109.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035112205892801285, + "kl": 0.006599330343306065, + "learning_rate": 1.367e-06, + "loss": 0.0003, + "num_tokens": 1742847.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 109.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02493727020919323, + "kl": 0.0010398050071671605, + "learning_rate": 1.3666666666666666e-06, + "loss": 0.0001, + "num_tokens": 1743173.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 109.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03313284367322922, + "kl": 0.007278790697455406, + "learning_rate": 1.3663333333333334e-06, + "loss": 0.0004, + "num_tokens": 1743462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 109.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003395878477022052, + "kl": 0.000351424008840695, + "learning_rate": 1.366e-06, + "loss": 0.0, + "num_tokens": 1743734.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 109.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22905008494853973, + "kl": 0.02140538615640253, + "learning_rate": 1.3656666666666667e-06, + "loss": 0.001, + "num_tokens": 1744004.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 109.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11046409606933594, + "kl": 0.02448669052682817, + "learning_rate": 1.3653333333333335e-06, + "loss": 0.0012, + "num_tokens": 1744292.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 109.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06299640983343124, + "kl": 0.002376459538936615, + "learning_rate": 1.365e-06, + "loss": 0.0001, + "num_tokens": 1744552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 109.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08487951755523682, + "kl": 0.00908275693655014, + "learning_rate": 1.3646666666666666e-06, + "loss": 0.0004, + "num_tokens": 1744858.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 109.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1807339191436768, + "kl": 0.11271590366959572, + "learning_rate": 1.3643333333333334e-06, + "loss": 0.0134, + "num_tokens": 1745228.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022632814943790436, + "kl": 0.0023123383289203048, + "learning_rate": 1.364e-06, + "loss": 0.0001, + "num_tokens": 1745482.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 109.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07472050935029984, + "kl": 0.013483912451192737, + "learning_rate": 1.3636666666666668e-06, + "loss": 0.0008, + "num_tokens": 1745748.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07059096544981003, + "kl": 0.1631886214017868, + "learning_rate": 1.3633333333333333e-06, + "loss": 0.0082, + "num_tokens": 1746058.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14876939356327057, + "kl": 0.03801564872264862, + "learning_rate": 1.363e-06, + "loss": 0.0022, + "num_tokens": 1746387.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 109.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009307836182415485, + "kl": 0.09792861342430115, + "learning_rate": 1.3626666666666667e-06, + "loss": 0.0049, + "num_tokens": 1746759.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21848231554031372, + "kl": 0.026918042451143265, + "learning_rate": 1.3623333333333335e-06, + "loss": 0.0013, + "num_tokens": 1747084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 109.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05574426054954529, + "kl": 0.027406545355916023, + "learning_rate": 1.362e-06, + "loss": 0.0014, + "num_tokens": 1747398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 109.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5493011474609375, + "kl": 0.11865761131048203, + "learning_rate": 1.3616666666666668e-06, + "loss": -0.0768, + "num_tokens": 1747670.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 5916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 109.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1623677909374237, + "kl": 0.02765484107658267, + "learning_rate": 1.3613333333333334e-06, + "loss": 0.0014, + "num_tokens": 1747999.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 109.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.097598075866699, + "kl": 0.07918963208794594, + "learning_rate": 1.361e-06, + "loss": 0.0532, + "num_tokens": 1748273.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 109.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024052917957305908, + "kl": 0.0011083036661148071, + "learning_rate": 1.3606666666666668e-06, + "loss": 0.0001, + "num_tokens": 1748485.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 109.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019945377483963966, + "kl": 0.05039198696613312, + "learning_rate": 1.3603333333333333e-06, + "loss": 0.0025, + "num_tokens": 1748817.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 109.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038226112723350525, + "kl": 0.0014246970458771102, + "learning_rate": 1.36e-06, + "loss": 0.0001, + "num_tokens": 1749073.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 109.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1107323095202446, + "kl": 0.004545584321022034, + "learning_rate": 1.3596666666666667e-06, + "loss": 0.0002, + "num_tokens": 1749284.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 109.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050962068140506744, + "kl": 0.0052134746219962835, + "learning_rate": 1.3593333333333335e-06, + "loss": 0.0003, + "num_tokens": 1749556.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061520714312791824, + "kl": 0.0009494051337242126, + "learning_rate": 1.359e-06, + "loss": 0.0, + "num_tokens": 1749768.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 109.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7254226803779602, + "kl": 0.06566239148378372, + "learning_rate": 1.3586666666666668e-06, + "loss": 0.0035, + "num_tokens": 1750013.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 109.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0523926205933094, + "kl": 0.001845061022322625, + "learning_rate": 1.3583333333333334e-06, + "loss": 0.0001, + "num_tokens": 1750279.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 109.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02411804534494877, + "kl": 0.009334418457001448, + "learning_rate": 1.358e-06, + "loss": 0.0005, + "num_tokens": 1750551.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 109.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3197144567966461, + "kl": 0.025151771493256092, + "learning_rate": 1.3576666666666667e-06, + "loss": 0.0017, + "num_tokens": 1750797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 109.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20844948291778564, + "kl": 0.022541755810379982, + "learning_rate": 1.3573333333333333e-06, + "loss": 0.0011, + "num_tokens": 1751131.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 109.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048450078815221786, + "kl": 0.023419609293341637, + "learning_rate": 1.3569999999999999e-06, + "loss": 0.0012, + "num_tokens": 1751485.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.247184991836548, + "kl": 0.030101838521659374, + "learning_rate": 1.3566666666666669e-06, + "loss": 0.2049, + "num_tokens": 1751814.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 109.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008923697285354137, + "kl": 0.0018349047750234604, + "learning_rate": 1.3563333333333334e-06, + "loss": 0.0001, + "num_tokens": 1752126.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 109.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0056344270706177, + "kl": 0.24381319480016828, + "learning_rate": 1.356e-06, + "loss": -0.0378, + "num_tokens": 1752449.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 5933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 109.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001068216748535633, + "kl": 0.0013050096458755434, + "learning_rate": 1.3556666666666668e-06, + "loss": 0.0001, + "num_tokens": 1752729.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 109.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03234436735510826, + "kl": 0.0012776028888765723, + "learning_rate": 1.3553333333333334e-06, + "loss": 0.0001, + "num_tokens": 1752963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 109.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07403942197561264, + "kl": 0.004665711574489251, + "learning_rate": 1.355e-06, + "loss": 0.0002, + "num_tokens": 1753277.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 109.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7347073554992676, + "kl": 0.005849546520039439, + "learning_rate": 1.3546666666666667e-06, + "loss": -0.0333, + "num_tokens": 1753565.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 5937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 109.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005078271962702274, + "kl": 0.26810602843761444, + "learning_rate": 1.3543333333333333e-06, + "loss": 0.0134, + "num_tokens": 1753869.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 109.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01574855111539364, + "kl": 0.043678248301148415, + "learning_rate": 1.3539999999999999e-06, + "loss": 0.0022, + "num_tokens": 1754273.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 110.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08108259737491608, + "kl": 0.012996236328035593, + "learning_rate": 1.3536666666666669e-06, + "loss": 0.0006, + "num_tokens": 1754609.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 110.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11689068377017975, + "kl": 0.026157384738326073, + "learning_rate": 1.3533333333333334e-06, + "loss": 0.0013, + "num_tokens": 1754883.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 110.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0001752555399434641, + "kl": 5.6549906730651855e-06, + "learning_rate": 1.353e-06, + "loss": 0.0, + "num_tokens": 1755103.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 110.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009663878940045834, + "kl": 0.09788351133465767, + "learning_rate": 1.3526666666666668e-06, + "loss": 0.0049, + "num_tokens": 1755475.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 110.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018762705847620964, + "kl": 0.0026635846588760614, + "learning_rate": 1.3523333333333334e-06, + "loss": 0.0001, + "num_tokens": 1755807.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06893157213926315, + "kl": 0.009115293622016907, + "learning_rate": 1.352e-06, + "loss": 0.0004, + "num_tokens": 1756086.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 110.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328707695007324, + "kl": 0.1779472827911377, + "learning_rate": 1.3516666666666667e-06, + "loss": 0.0482, + "num_tokens": 1756449.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 110.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10574982315301895, + "kl": 0.006369042210280895, + "learning_rate": 1.3513333333333333e-06, + "loss": 0.0003, + "num_tokens": 1756745.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 110.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05269327759742737, + "kl": 0.0019246204756200314, + "learning_rate": 1.3509999999999999e-06, + "loss": 0.0001, + "num_tokens": 1757006.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 110.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5032901763916016, + "kl": 0.34883364103734493, + "learning_rate": 1.3506666666666668e-06, + "loss": 0.0195, + "num_tokens": 1757358.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 110.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0053315297700464725, + "kl": 0.00042948126792907715, + "learning_rate": 1.3503333333333334e-06, + "loss": 0.0, + "num_tokens": 1757578.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 110.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1845754384994507, + "kl": 0.12331453897058964, + "learning_rate": 1.35e-06, + "loss": 0.0068, + "num_tokens": 1757986.0, + "reward": 1.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 1.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 5951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 110.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08301302790641785, + "kl": 0.0038667218759655952, + "learning_rate": 1.3496666666666668e-06, + "loss": 0.0002, + "num_tokens": 1758304.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 110.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032326288521289825, + "kl": 0.001139427360612899, + "learning_rate": 1.3493333333333333e-06, + "loss": 0.0001, + "num_tokens": 1758624.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03329334035515785, + "kl": 0.002319994615390897, + "learning_rate": 1.349e-06, + "loss": 0.0001, + "num_tokens": 1758884.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033758513163775206, + "kl": 0.00046894027036614716, + "learning_rate": 1.3486666666666667e-06, + "loss": 0.0, + "num_tokens": 1759146.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 110.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006341130938380957, + "kl": 0.0007953941822052002, + "learning_rate": 1.3483333333333333e-06, + "loss": 0.0, + "num_tokens": 1759362.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 110.31481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.607242465019226, + "kl": 0.16057805716991425, + "learning_rate": 1.348e-06, + "loss": 0.216, + "num_tokens": 1759728.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 5957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.060657024383545, + "kl": 0.027550682425498962, + "learning_rate": 1.3476666666666668e-06, + "loss": 0.117, + "num_tokens": 1760014.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 110.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04181768372654915, + "kl": 0.01945724617689848, + "learning_rate": 1.3473333333333334e-06, + "loss": 0.001, + "num_tokens": 1760377.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02010614238679409, + "kl": 0.002240018220618367, + "learning_rate": 1.347e-06, + "loss": 0.0001, + "num_tokens": 1760661.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 110.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03157370537519455, + "kl": 0.0025909217074513435, + "learning_rate": 1.3466666666666668e-06, + "loss": 0.0001, + "num_tokens": 1760973.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03155425190925598, + "kl": 0.010468210093677044, + "learning_rate": 1.3463333333333333e-06, + "loss": 0.0005, + "num_tokens": 1761245.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08448175340890884, + "kl": 0.013919135555624962, + "learning_rate": 1.346e-06, + "loss": 0.0008, + "num_tokens": 1761513.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 110.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008087906171567738, + "kl": 0.003783509135246277, + "learning_rate": 1.3456666666666667e-06, + "loss": 0.0002, + "num_tokens": 1761749.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 110.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04153326526284218, + "kl": 0.0037280984688550234, + "learning_rate": 1.3453333333333333e-06, + "loss": 0.0002, + "num_tokens": 1762021.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 110.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0775628536939621, + "kl": 0.008935235207900405, + "learning_rate": 1.345e-06, + "loss": 0.0004, + "num_tokens": 1762350.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 110.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3141268491744995, + "kl": 0.024234028678620234, + "learning_rate": 1.3446666666666668e-06, + "loss": 0.0012, + "num_tokens": 1762600.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017690027132630348, + "kl": 0.0002167165366699919, + "learning_rate": 1.3443333333333334e-06, + "loss": 0.0, + "num_tokens": 1762856.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 110.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009021191857755184, + "kl": 0.0004621073603630066, + "learning_rate": 1.344e-06, + "loss": 0.0, + "num_tokens": 1763062.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 110.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00440799817442894, + "kl": 0.2682085633277893, + "learning_rate": 1.3436666666666667e-06, + "loss": 0.0134, + "num_tokens": 1763366.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 110.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04712720587849617, + "kl": 0.03256791643798351, + "learning_rate": 1.3433333333333333e-06, + "loss": 0.0016, + "num_tokens": 1763666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 110.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04081154614686966, + "kl": 0.004494331777095795, + "learning_rate": 1.343e-06, + "loss": 0.0002, + "num_tokens": 1763882.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 110.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1078043282032013, + "kl": 0.01819766405969858, + "learning_rate": 1.3426666666666667e-06, + "loss": 0.0009, + "num_tokens": 1764210.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08867350965738297, + "kl": 0.0025535154854878783, + "learning_rate": 1.3423333333333332e-06, + "loss": 0.0001, + "num_tokens": 1764480.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 110.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20398961007595062, + "kl": 0.09276177920401096, + "learning_rate": 1.342e-06, + "loss": 0.0048, + "num_tokens": 1764853.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 110.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055068567395210266, + "kl": 0.1615331843495369, + "learning_rate": 1.3416666666666668e-06, + "loss": 0.0081, + "num_tokens": 1765163.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 110.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1508154571056366, + "kl": 0.03673155512660742, + "learning_rate": 1.3413333333333334e-06, + "loss": 0.0018, + "num_tokens": 1765462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 110.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15242016315460205, + "kl": 0.04679614119231701, + "learning_rate": 1.3410000000000002e-06, + "loss": 0.0022, + "num_tokens": 1765776.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 110.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.258967399597168, + "kl": 0.06232669949531555, + "learning_rate": 1.3406666666666667e-06, + "loss": 0.1485, + "num_tokens": 1766135.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 110.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08936386555433273, + "kl": 0.004410826601088047, + "learning_rate": 1.3403333333333333e-06, + "loss": 0.0002, + "num_tokens": 1766368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 110.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06297900527715683, + "kl": 0.0019434280693531036, + "learning_rate": 1.34e-06, + "loss": 0.0001, + "num_tokens": 1766628.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 110.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1812336444854736, + "kl": 0.03287340234965086, + "learning_rate": 1.3396666666666667e-06, + "loss": 0.0081, + "num_tokens": 1766932.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 5982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23258471488952637, + "kl": 0.0386712783947587, + "learning_rate": 1.3393333333333332e-06, + "loss": 0.0017, + "num_tokens": 1767230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 110.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12552443146705627, + "kl": 0.010302982293069363, + "learning_rate": 1.339e-06, + "loss": 0.0005, + "num_tokens": 1767520.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3221331238746643, + "kl": 0.045488059520721436, + "learning_rate": 1.3386666666666668e-06, + "loss": 0.0023, + "num_tokens": 1767790.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 110.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02133364975452423, + "kl": 0.03848847094923258, + "learning_rate": 1.3383333333333334e-06, + "loss": 0.002, + "num_tokens": 1768082.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 110.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040582749992609024, + "kl": 0.008734130766242743, + "learning_rate": 1.3380000000000001e-06, + "loss": 0.0004, + "num_tokens": 1768421.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 110.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05048838257789612, + "kl": 0.017860619351267815, + "learning_rate": 1.3376666666666667e-06, + "loss": 0.0007, + "num_tokens": 1768748.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 110.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.376307487487793, + "kl": 0.016093899495899677, + "learning_rate": 1.3373333333333333e-06, + "loss": 0.0145, + "num_tokens": 1769038.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 5989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 110.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019687240943312645, + "kl": 0.0004957199125783518, + "learning_rate": 1.337e-06, + "loss": 0.0, + "num_tokens": 1769251.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 5990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 110.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16235597431659698, + "kl": 0.0057592743542045355, + "learning_rate": 1.3366666666666666e-06, + "loss": 0.0003, + "num_tokens": 1769527.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 110.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03277580440044403, + "kl": 0.00822208309546113, + "learning_rate": 1.3363333333333332e-06, + "loss": 0.0004, + "num_tokens": 1769787.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 110.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02874281443655491, + "kl": 0.0011245176574448124, + "learning_rate": 1.3360000000000002e-06, + "loss": 0.0001, + "num_tokens": 1770057.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 111.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06061610206961632, + "kl": 0.003574213129468262, + "learning_rate": 1.3356666666666668e-06, + "loss": 0.0002, + "num_tokens": 1770357.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 5994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 111.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.414045333862305, + "kl": 0.0259128431789577, + "learning_rate": 1.3353333333333334e-06, + "loss": -0.0345, + "num_tokens": 1770633.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 5995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 111.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04092200845479965, + "kl": 0.02369754295796156, + "learning_rate": 1.3350000000000001e-06, + "loss": 0.0012, + "num_tokens": 1770985.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 5996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 111.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29867076873779297, + "kl": 0.05115535855293274, + "learning_rate": 1.3346666666666667e-06, + "loss": 0.0025, + "num_tokens": 1771303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 5997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 111.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025144696701318026, + "kl": 1.0229647159576416e-05, + "learning_rate": 1.3343333333333333e-06, + "loss": 0.0, + "num_tokens": 1771523.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 5998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 111.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0722994804382324, + "kl": 0.11085102520883083, + "learning_rate": 1.334e-06, + "loss": -0.0576, + "num_tokens": 1771859.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 5999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 111.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019805828109383583, + "kl": 0.0006284096743911505, + "learning_rate": 1.3336666666666666e-06, + "loss": 0.0, + "num_tokens": 1772094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 111.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03412608802318573, + "kl": 0.0025367558700963855, + "learning_rate": 1.3333333333333332e-06, + "loss": 0.0001, + "num_tokens": 1772354.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 111.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009219389408826828, + "kl": 0.0018447404727339745, + "learning_rate": 1.3330000000000002e-06, + "loss": 0.0001, + "num_tokens": 1772666.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 111.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3457442820072174, + "kl": 0.036103968508541584, + "learning_rate": 1.3326666666666668e-06, + "loss": 0.0018, + "num_tokens": 1772972.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 111.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01878589764237404, + "kl": 0.0073223109357059, + "learning_rate": 1.3323333333333333e-06, + "loss": 0.0004, + "num_tokens": 1773258.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 111.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3248395025730133, + "kl": 0.04220906086266041, + "learning_rate": 1.3320000000000001e-06, + "loss": 0.0023, + "num_tokens": 1773631.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 111.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044450752437114716, + "kl": 0.00617857207544148, + "learning_rate": 1.3316666666666667e-06, + "loss": 0.0003, + "num_tokens": 1773920.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 111.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024042680859565735, + "kl": 0.007981546688824892, + "learning_rate": 1.3313333333333333e-06, + "loss": 0.0004, + "num_tokens": 1774224.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 111.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03342939540743828, + "kl": 0.0013586650602519512, + "learning_rate": 1.331e-06, + "loss": 0.0001, + "num_tokens": 1774496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 111.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3270973861217499, + "kl": 0.03668313066009432, + "learning_rate": 1.3306666666666666e-06, + "loss": 0.0017, + "num_tokens": 1774765.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 111.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09173467010259628, + "kl": 0.02035329419595655, + "learning_rate": 1.3303333333333332e-06, + "loss": 0.0011, + "num_tokens": 1775051.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 111.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014538324438035488, + "kl": 0.09772773459553719, + "learning_rate": 1.3300000000000002e-06, + "loss": 0.0049, + "num_tokens": 1775423.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 111.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10422306507825851, + "kl": 0.06520166248083115, + "learning_rate": 1.3296666666666668e-06, + "loss": 0.0033, + "num_tokens": 1775802.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 111.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19651436805725098, + "kl": 0.08215455524623394, + "learning_rate": 1.3293333333333333e-06, + "loss": 0.0041, + "num_tokens": 1776170.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 111.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06956064701080322, + "kl": 0.016023690346628428, + "learning_rate": 1.3290000000000001e-06, + "loss": 0.0008, + "num_tokens": 1776431.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 111.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4350983798503876, + "kl": 0.058666869066655636, + "learning_rate": 1.3286666666666667e-06, + "loss": 0.003, + "num_tokens": 1776704.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 111.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02888704463839531, + "kl": 0.0004631221236195415, + "learning_rate": 1.3283333333333333e-06, + "loss": 0.0, + "num_tokens": 1776917.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 111.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12169242650270462, + "kl": 0.026958446018397808, + "learning_rate": 1.328e-06, + "loss": 0.0013, + "num_tokens": 1777213.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 111.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06427167356014252, + "kl": 0.006387921050190926, + "learning_rate": 1.3276666666666666e-06, + "loss": 0.0003, + "num_tokens": 1777508.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 111.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14414052665233612, + "kl": 0.00976718142919708, + "learning_rate": 1.3273333333333334e-06, + "loss": 0.0004, + "num_tokens": 1777766.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 111.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021128715947270393, + "kl": 0.004333413438871503, + "learning_rate": 1.3270000000000002e-06, + "loss": 0.0002, + "num_tokens": 1778096.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 111.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02932870388031006, + "kl": 0.0011645738850347698, + "learning_rate": 1.3266666666666667e-06, + "loss": 0.0001, + "num_tokens": 1778366.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 111.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9630263447761536, + "kl": 0.18346747010946274, + "learning_rate": 1.3263333333333333e-06, + "loss": 0.0094, + "num_tokens": 1778714.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 111.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010983824729919434, + "kl": 0.001294062938541174, + "learning_rate": 1.326e-06, + "loss": 0.0001, + "num_tokens": 1779010.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 111.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0399341881275177, + "kl": 0.0019307732582092285, + "learning_rate": 1.3256666666666667e-06, + "loss": 0.0001, + "num_tokens": 1779218.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 111.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06619943678379059, + "kl": 0.011385556310415268, + "learning_rate": 1.3253333333333332e-06, + "loss": 0.0006, + "num_tokens": 1779549.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 111.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000785927870310843, + "kl": 0.0037786588072776794, + "learning_rate": 1.325e-06, + "loss": 0.0002, + "num_tokens": 1779785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 111.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09970193356275558, + "kl": 0.006901193235535175, + "learning_rate": 1.3246666666666666e-06, + "loss": 0.0003, + "num_tokens": 1780051.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 111.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05055214837193489, + "kl": 0.011935213580727577, + "learning_rate": 1.3243333333333334e-06, + "loss": 0.0006, + "num_tokens": 1780374.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 111.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17069141566753387, + "kl": 0.013828654307872057, + "learning_rate": 1.3240000000000002e-06, + "loss": 0.0006, + "num_tokens": 1780646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 111.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6104181408882141, + "kl": 0.040911171585321426, + "learning_rate": 1.3236666666666667e-06, + "loss": 0.0025, + "num_tokens": 1780895.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 111.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005884718149900436, + "kl": 0.00035889819264411926, + "learning_rate": 1.3233333333333333e-06, + "loss": 0.0, + "num_tokens": 1781155.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 111.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06621158123016357, + "kl": 0.001562038087286055, + "learning_rate": 1.323e-06, + "loss": 0.0001, + "num_tokens": 1781411.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 111.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2523360252380371, + "kl": 0.29302704334259033, + "learning_rate": 1.3226666666666667e-06, + "loss": 0.0147, + "num_tokens": 1781716.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 111.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022643107920885086, + "kl": 0.002543122856877744, + "learning_rate": 1.3223333333333334e-06, + "loss": 0.0001, + "num_tokens": 1781995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 111.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21131780743598938, + "kl": 0.012039016000926495, + "learning_rate": 1.322e-06, + "loss": 0.0007, + "num_tokens": 1782216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 111.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07311871647834778, + "kl": 0.008500882424414158, + "learning_rate": 1.3216666666666666e-06, + "loss": 0.0004, + "num_tokens": 1782484.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 111.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022241367027163506, + "kl": 0.0029014392057433724, + "learning_rate": 1.3213333333333334e-06, + "loss": 0.0001, + "num_tokens": 1782790.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 111.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3976516723632812, + "kl": 0.5538979358971119, + "learning_rate": 1.3210000000000001e-06, + "loss": 0.0091, + "num_tokens": 1783192.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 6038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 111.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019791917875409126, + "kl": 0.002384623629041016, + "learning_rate": 1.3206666666666667e-06, + "loss": 0.0001, + "num_tokens": 1783476.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 111.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0845181792974472, + "kl": 0.03165253438055515, + "learning_rate": 1.3203333333333335e-06, + "loss": 0.0016, + "num_tokens": 1783791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 111.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435367912054062, + "kl": 0.011215799488127232, + "learning_rate": 1.32e-06, + "loss": 0.0007, + "num_tokens": 1784010.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 111.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027536695823073387, + "kl": 0.0017975717782974243, + "learning_rate": 1.3196666666666666e-06, + "loss": 0.0001, + "num_tokens": 1784222.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 111.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011715343222022057, + "kl": 0.0004479595518205315, + "learning_rate": 1.3193333333333334e-06, + "loss": 0.0, + "num_tokens": 1784539.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 111.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.094869613647461, + "kl": 0.017732942011207342, + "learning_rate": 1.319e-06, + "loss": 0.2422, + "num_tokens": 1784816.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 6044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 111.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30865153670310974, + "kl": 0.204693503677845, + "learning_rate": 1.3186666666666666e-06, + "loss": 0.0102, + "num_tokens": 1785124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 111.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.185741901397705, + "kl": 0.23800479620695114, + "learning_rate": 1.3183333333333333e-06, + "loss": 0.1464, + "num_tokens": 1785437.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 111.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.094146728515625, + "kl": 0.0469056311994791, + "learning_rate": 1.3180000000000001e-06, + "loss": 0.0487, + "num_tokens": 1785764.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 112.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005997946485877037, + "kl": 0.0006072355608921498, + "learning_rate": 1.3176666666666667e-06, + "loss": 0.0, + "num_tokens": 1786076.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 112.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07137975096702576, + "kl": 0.16442856192588806, + "learning_rate": 1.3173333333333335e-06, + "loss": 0.0082, + "num_tokens": 1786385.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 112.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0800158903002739, + "kl": 0.013670348562300205, + "learning_rate": 1.317e-06, + "loss": 0.0007, + "num_tokens": 1786645.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 112.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018708205607254058, + "kl": 6.757676601409912e-06, + "learning_rate": 1.3166666666666666e-06, + "loss": 0.0, + "num_tokens": 1786865.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 112.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00198156270198524, + "kl": 9.253621101379395e-05, + "learning_rate": 1.3163333333333334e-06, + "loss": 0.0, + "num_tokens": 1787108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 112.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004433844238519669, + "kl": 0.0003328956663608551, + "learning_rate": 1.316e-06, + "loss": 0.0, + "num_tokens": 1787368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 112.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01701241359114647, + "kl": 0.0007004120707279071, + "learning_rate": 1.3156666666666665e-06, + "loss": 0.0, + "num_tokens": 1787601.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 112.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796699166297913, + "kl": 0.01005538646131754, + "learning_rate": 1.3153333333333335e-06, + "loss": 0.0006, + "num_tokens": 1787820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 112.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061234571039676666, + "kl": 0.011406462639570236, + "learning_rate": 1.3150000000000001e-06, + "loss": 0.0006, + "num_tokens": 1788151.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 112.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004983628634363413, + "kl": 4.785507917404175e-05, + "learning_rate": 1.3146666666666667e-06, + "loss": 0.0, + "num_tokens": 1788363.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 112.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08072728663682938, + "kl": 0.05556000769138336, + "learning_rate": 1.3143333333333335e-06, + "loss": 0.0028, + "num_tokens": 1788657.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 112.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05396899953484535, + "kl": 0.010494334623217583, + "learning_rate": 1.314e-06, + "loss": 0.0005, + "num_tokens": 1789007.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 112.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.543977737426758, + "kl": 0.12582920491695404, + "learning_rate": 1.3136666666666666e-06, + "loss": -0.0717, + "num_tokens": 1789372.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 6060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 112.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009092212421819568, + "kl": 0.0037513896822929382, + "learning_rate": 1.3133333333333334e-06, + "loss": 0.0002, + "num_tokens": 1789608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 39.0, + "completions/mean_terminated_length": 39.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 112.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5136048197746277, + "kl": 0.09922398626804352, + "learning_rate": 1.313e-06, + "loss": 0.005, + "num_tokens": 1789980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 112.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06007806956768036, + "kl": 0.025501039810478687, + "learning_rate": 1.3126666666666665e-06, + "loss": 0.0013, + "num_tokens": 1790340.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 112.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05802278593182564, + "kl": 0.008447068918030709, + "learning_rate": 1.3123333333333335e-06, + "loss": 0.0004, + "num_tokens": 1790608.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 112.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03410980477929115, + "kl": 0.005437546409666538, + "learning_rate": 1.312e-06, + "loss": 0.0003, + "num_tokens": 1790896.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 112.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.564185380935669, + "kl": 0.15899060387164354, + "learning_rate": 1.3116666666666667e-06, + "loss": -0.0286, + "num_tokens": 1791185.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 112.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039753057062625885, + "kl": 0.0014929691096767783, + "learning_rate": 1.3113333333333335e-06, + "loss": 0.0001, + "num_tokens": 1791507.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 112.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.200249671936035, + "kl": 0.02246608305722475, + "learning_rate": 1.311e-06, + "loss": 0.1129, + "num_tokens": 1791881.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 112.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04710720106959343, + "kl": 0.0030373672489076853, + "learning_rate": 1.3106666666666666e-06, + "loss": 0.0002, + "num_tokens": 1792177.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 112.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10301186889410019, + "kl": 0.016376479528844357, + "learning_rate": 1.3103333333333334e-06, + "loss": 0.0008, + "num_tokens": 1792473.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 112.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03799111023545265, + "kl": 0.0005753666337113827, + "learning_rate": 1.31e-06, + "loss": 0.0, + "num_tokens": 1792729.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 112.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1706431359052658, + "kl": 0.04853484034538269, + "learning_rate": 1.3096666666666665e-06, + "loss": 0.0024, + "num_tokens": 1793029.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 112.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.545912742614746, + "kl": 0.23079244047403336, + "learning_rate": 1.3093333333333335e-06, + "loss": 0.0285, + "num_tokens": 1793362.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 112.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20008555054664612, + "kl": 0.029866354539990425, + "learning_rate": 1.309e-06, + "loss": 0.0018, + "num_tokens": 1793654.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 112.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007798391743563116, + "kl": 0.0012585946824401617, + "learning_rate": 1.3086666666666667e-06, + "loss": 0.0001, + "num_tokens": 1793934.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 112.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09289143979549408, + "kl": 0.013444689102470875, + "learning_rate": 1.3083333333333334e-06, + "loss": 0.0007, + "num_tokens": 1794274.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 112.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.097061634063721, + "kl": 0.01800231065135449, + "learning_rate": 1.308e-06, + "loss": 0.0549, + "num_tokens": 1794550.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 112.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0736713632941246, + "kl": 0.005710856756195426, + "learning_rate": 1.3076666666666666e-06, + "loss": 0.0003, + "num_tokens": 1794814.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 112.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041961751878261566, + "kl": 0.0032766188960522413, + "learning_rate": 1.3073333333333334e-06, + "loss": 0.0002, + "num_tokens": 1795144.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 112.5925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2190539836883545, + "kl": 0.014457188313826919, + "learning_rate": 1.307e-06, + "loss": 0.0532, + "num_tokens": 1795481.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 112.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037971947342157364, + "kl": 0.006282295798882842, + "learning_rate": 1.3066666666666667e-06, + "loss": 0.0003, + "num_tokens": 1795749.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 112.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4320151805877686, + "kl": 0.07949055475182831, + "learning_rate": 1.3063333333333335e-06, + "loss": 0.0763, + "num_tokens": 1796012.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 112.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05311053991317749, + "kl": 0.0035557467490434647, + "learning_rate": 1.306e-06, + "loss": 0.0002, + "num_tokens": 1796326.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 112.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.467212200164795, + "kl": 0.028736325912177563, + "learning_rate": 1.3056666666666666e-06, + "loss": 0.0267, + "num_tokens": 1796675.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 112.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014098790474236012, + "kl": 0.09716768935322762, + "learning_rate": 1.3053333333333334e-06, + "loss": 0.0049, + "num_tokens": 1797047.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 112.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07768841087818146, + "kl": 0.052976781502366066, + "learning_rate": 1.305e-06, + "loss": 0.0026, + "num_tokens": 1797319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 112.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04761912301182747, + "kl": 0.0012773325142916292, + "learning_rate": 1.3046666666666666e-06, + "loss": 0.0, + "num_tokens": 1797539.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 112.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13366056978702545, + "kl": 0.02104736864566803, + "learning_rate": 1.3043333333333334e-06, + "loss": 0.0011, + "num_tokens": 1797849.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 112.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6834845542907715, + "kl": 0.2582798183429986, + "learning_rate": 1.304e-06, + "loss": 0.0091, + "num_tokens": 1798117.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 6089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 112.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7414571046829224, + "kl": 0.22417202312499285, + "learning_rate": 1.3036666666666667e-06, + "loss": 0.0267, + "num_tokens": 1798524.0, + "reward": 2.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.5, + "step": 6090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 112.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035348184406757355, + "kl": 0.00517903221771121, + "learning_rate": 1.3033333333333335e-06, + "loss": 0.0002, + "num_tokens": 1798824.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 112.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0469917356967926, + "kl": 0.001685740160610294, + "learning_rate": 1.303e-06, + "loss": 0.0001, + "num_tokens": 1799096.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 112.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0281081199646, + "kl": 0.04458103032084182, + "learning_rate": 1.3026666666666666e-06, + "loss": 0.053, + "num_tokens": 1799317.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 112.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022530920803546906, + "kl": 0.0019829481607303023, + "learning_rate": 1.3023333333333334e-06, + "loss": 0.0001, + "num_tokens": 1799577.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 112.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005209250375628471, + "kl": 0.26801739633083344, + "learning_rate": 1.302e-06, + "loss": 0.0134, + "num_tokens": 1799881.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 112.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020403945818543434, + "kl": 0.0027150855166837573, + "learning_rate": 1.3016666666666668e-06, + "loss": 0.0001, + "num_tokens": 1800163.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 112.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00602941308170557, + "kl": 0.000691894005285576, + "learning_rate": 1.3013333333333333e-06, + "loss": 0.0, + "num_tokens": 1800475.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 112.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03966996446251869, + "kl": 0.0006224736571311951, + "learning_rate": 1.301e-06, + "loss": 0.0, + "num_tokens": 1800685.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 112.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057423096150159836, + "kl": 0.013602379709482193, + "learning_rate": 1.3006666666666667e-06, + "loss": 0.0007, + "num_tokens": 1800987.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 112.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050531454384326935, + "kl": 0.005418589920736849, + "learning_rate": 1.3003333333333335e-06, + "loss": 0.0003, + "num_tokens": 1801249.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 112.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04474520683288574, + "kl": 0.008686012821272016, + "learning_rate": 1.3e-06, + "loss": 0.0004, + "num_tokens": 1801529.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 113.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15142762660980225, + "kl": 0.04436723701655865, + "learning_rate": 1.2996666666666668e-06, + "loss": 0.0023, + "num_tokens": 1801849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 113.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02884383127093315, + "kl": 0.0018276572227478027, + "learning_rate": 1.2993333333333334e-06, + "loss": 0.0001, + "num_tokens": 1802061.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 113.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2334437370300293, + "kl": 0.2099862964823842, + "learning_rate": 1.299e-06, + "loss": 0.0022, + "num_tokens": 1802408.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 6104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 113.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07001669704914093, + "kl": 0.026877841912209988, + "learning_rate": 1.2986666666666668e-06, + "loss": 0.0015, + "num_tokens": 1802698.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 113.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053630854934453964, + "kl": 0.004849277785979211, + "learning_rate": 1.2983333333333333e-06, + "loss": 0.0003, + "num_tokens": 1802971.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 113.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05525628849864006, + "kl": 0.04509176127612591, + "learning_rate": 1.298e-06, + "loss": 0.0023, + "num_tokens": 1803375.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 113.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0272380281239748, + "kl": 0.004748962353914976, + "learning_rate": 1.2976666666666667e-06, + "loss": 0.0002, + "num_tokens": 1803643.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 113.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06247115880250931, + "kl": 0.002168981940485537, + "learning_rate": 1.2973333333333335e-06, + "loss": 0.0001, + "num_tokens": 1803939.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 113.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03162141516804695, + "kl": 0.0010609924793243408, + "learning_rate": 1.297e-06, + "loss": 0.0, + "num_tokens": 1804145.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 113.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16321514546871185, + "kl": 0.01150759935262613, + "learning_rate": 1.2966666666666668e-06, + "loss": 0.0007, + "num_tokens": 1804385.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046190064400434494, + "kl": 0.03093954734504223, + "learning_rate": 1.2963333333333334e-06, + "loss": 0.0015, + "num_tokens": 1804729.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 113.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3360211253166199, + "kl": 0.030782817862927914, + "learning_rate": 1.296e-06, + "loss": 0.0017, + "num_tokens": 1804994.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5796241760253906, + "kl": 0.09935360588133335, + "learning_rate": 1.2956666666666667e-06, + "loss": 0.0051, + "num_tokens": 1805293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 113.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017644623294472694, + "kl": 0.002459119656123221, + "learning_rate": 1.2953333333333333e-06, + "loss": 0.0001, + "num_tokens": 1805575.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419870465993881, + "kl": 0.16157615184783936, + "learning_rate": 1.2949999999999999e-06, + "loss": 0.0081, + "num_tokens": 1805885.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 113.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21626430749893188, + "kl": 0.029798878356814384, + "learning_rate": 1.2946666666666669e-06, + "loss": 0.0014, + "num_tokens": 1806231.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 113.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0877583920955658, + "kl": 0.012825872283428907, + "learning_rate": 1.2943333333333334e-06, + "loss": 0.0006, + "num_tokens": 1806522.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 113.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02765658311545849, + "kl": 0.0002564266324043274, + "learning_rate": 1.294e-06, + "loss": 0.0, + "num_tokens": 1806734.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 113.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003085933392867446, + "kl": 0.00047844648361206055, + "learning_rate": 1.2936666666666668e-06, + "loss": 0.0, + "num_tokens": 1806994.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 113.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1523253470659256, + "kl": 0.04721454158425331, + "learning_rate": 1.2933333333333334e-06, + "loss": 0.0024, + "num_tokens": 1807265.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 113.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27010905742645264, + "kl": 0.033112211152911186, + "learning_rate": 1.293e-06, + "loss": 0.002, + "num_tokens": 1807541.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 113.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6834784150123596, + "kl": 0.060694653540849686, + "learning_rate": 1.2926666666666667e-06, + "loss": 0.0033, + "num_tokens": 1807803.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 113.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007319572032429278, + "kl": 0.0037931501865386963, + "learning_rate": 1.2923333333333333e-06, + "loss": 0.0002, + "num_tokens": 1808039.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 113.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01870492659509182, + "kl": 0.0021220995113253593, + "learning_rate": 1.2919999999999999e-06, + "loss": 0.0001, + "num_tokens": 1808316.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 113.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05359676480293274, + "kl": 0.005423239199444652, + "learning_rate": 1.2916666666666669e-06, + "loss": 0.0003, + "num_tokens": 1808604.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 113.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018162034451961517, + "kl": 0.0004427611784194596, + "learning_rate": 1.2913333333333334e-06, + "loss": 0.0, + "num_tokens": 1808874.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 113.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08776697516441345, + "kl": 0.013297136407345533, + "learning_rate": 1.291e-06, + "loss": 0.0006, + "num_tokens": 1809167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 113.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.233470916748047, + "kl": 0.04257943370612338, + "learning_rate": 1.2906666666666668e-06, + "loss": 0.0267, + "num_tokens": 1809495.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 113.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021943939849734306, + "kl": 0.0002798199711833149, + "learning_rate": 1.2903333333333334e-06, + "loss": 0.0, + "num_tokens": 1809751.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.470475673675537, + "kl": 0.4380806051194668, + "learning_rate": 1.29e-06, + "loss": 0.0239, + "num_tokens": 1810055.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 113.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009376843459904194, + "kl": 0.009066774509847164, + "learning_rate": 1.2896666666666667e-06, + "loss": 0.0005, + "num_tokens": 1810327.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 113.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04478984698653221, + "kl": 0.0022650300234090537, + "learning_rate": 1.2893333333333333e-06, + "loss": 0.0001, + "num_tokens": 1810641.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 113.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146836519241333, + "kl": 0.012147336732596159, + "learning_rate": 1.2889999999999999e-06, + "loss": 0.0006, + "num_tokens": 1810906.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 113.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020878897979855537, + "kl": 0.01223933044821024, + "learning_rate": 1.2886666666666669e-06, + "loss": 0.0006, + "num_tokens": 1811166.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 113.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10959409177303314, + "kl": 0.031592690385878086, + "learning_rate": 1.2883333333333334e-06, + "loss": 0.0017, + "num_tokens": 1811489.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 113.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435224711894989, + "kl": 0.009896425995975733, + "learning_rate": 1.288e-06, + "loss": 0.0005, + "num_tokens": 1811793.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 113.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005696768872439861, + "kl": 0.267911359667778, + "learning_rate": 1.2876666666666668e-06, + "loss": 0.0134, + "num_tokens": 1812097.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 113.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29938679933547974, + "kl": 0.027862831600941718, + "learning_rate": 1.2873333333333333e-06, + "loss": 0.0015, + "num_tokens": 1812426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 113.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1182071641087532, + "kl": 0.059394070878624916, + "learning_rate": 1.287e-06, + "loss": 0.003, + "num_tokens": 1812766.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 113.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00019236697698943317, + "kl": 6.802380084991455e-06, + "learning_rate": 1.2866666666666667e-06, + "loss": 0.0, + "num_tokens": 1812986.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15949071943759918, + "kl": 0.022955283522605896, + "learning_rate": 1.2863333333333333e-06, + "loss": 0.0012, + "num_tokens": 1813284.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 113.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006175547372549772, + "kl": 0.0015174001455307007, + "learning_rate": 1.286e-06, + "loss": 0.0001, + "num_tokens": 1813500.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 113.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056358322501182556, + "kl": 0.003626002697274089, + "learning_rate": 1.2856666666666668e-06, + "loss": 0.0002, + "num_tokens": 1813829.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 113.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003910984843969345, + "kl": 0.00014029815793037415, + "learning_rate": 1.2853333333333334e-06, + "loss": 0.0, + "num_tokens": 1814073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 113.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02486460469663143, + "kl": 0.0015810569748282433, + "learning_rate": 1.285e-06, + "loss": 0.0001, + "num_tokens": 1814385.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 113.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014391753822565079, + "kl": 0.09706833586096764, + "learning_rate": 1.2846666666666668e-06, + "loss": 0.0049, + "num_tokens": 1814757.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 113.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004465989302843809, + "kl": 0.00036110280780121684, + "learning_rate": 1.2843333333333333e-06, + "loss": 0.0, + "num_tokens": 1814977.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 113.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31261640787124634, + "kl": 0.07626931555569172, + "learning_rate": 1.284e-06, + "loss": 0.0041, + "num_tokens": 1815366.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 113.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3386521339416504, + "kl": 0.041719175642356277, + "learning_rate": 1.2836666666666667e-06, + "loss": -0.0635, + "num_tokens": 1815650.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 113.9074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2824010848999023, + "kl": 0.061978865414857864, + "learning_rate": 1.2833333333333333e-06, + "loss": -0.018, + "num_tokens": 1816023.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 113.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.945875883102417, + "kl": 0.11103121191263199, + "learning_rate": 1.283e-06, + "loss": -0.0392, + "num_tokens": 1816403.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 113.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032043103128671646, + "kl": 0.001565319747896865, + "learning_rate": 1.2826666666666668e-06, + "loss": 0.0001, + "num_tokens": 1816679.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 113.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04927992820739746, + "kl": 0.010830877348780632, + "learning_rate": 1.2823333333333334e-06, + "loss": 0.0005, + "num_tokens": 1816998.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 113.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1662204265594482, + "kl": 0.13761737197637558, + "learning_rate": 1.282e-06, + "loss": -0.076, + "num_tokens": 1817339.0, + "reward": 3.375, + "reward_std": 3.3008837699890137, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 3.3008837699890137, + "step": 6155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 114.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5782065391540527, + "kl": 0.5165721024386585, + "learning_rate": 1.2816666666666667e-06, + "loss": 0.1623, + "num_tokens": 1817604.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 114.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028663193807005882, + "kl": 0.00029393285512924194, + "learning_rate": 1.2813333333333333e-06, + "loss": 0.0, + "num_tokens": 1817816.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05983349308371544, + "kl": 0.011977674905210733, + "learning_rate": 1.281e-06, + "loss": 0.0006, + "num_tokens": 1818139.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 114.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026303723454475403, + "kl": 0.0024897477123886347, + "learning_rate": 1.2806666666666667e-06, + "loss": 0.0001, + "num_tokens": 1818419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 114.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039252396672964096, + "kl": 0.0016693869838491082, + "learning_rate": 1.2803333333333332e-06, + "loss": 0.0001, + "num_tokens": 1818687.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 114.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05376654863357544, + "kl": 0.024469844065606594, + "learning_rate": 1.28e-06, + "loss": 0.0012, + "num_tokens": 1819051.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 114.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05036499351263046, + "kl": 0.006932688876986504, + "learning_rate": 1.2796666666666668e-06, + "loss": 0.0003, + "num_tokens": 1819342.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 114.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004733308218419552, + "kl": 0.0003312766639282927, + "learning_rate": 1.2793333333333334e-06, + "loss": 0.0, + "num_tokens": 1819562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 114.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0364217534661293, + "kl": 0.004737072857096791, + "learning_rate": 1.2790000000000002e-06, + "loss": 0.0002, + "num_tokens": 1819850.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10991999506950378, + "kl": 0.03192046098411083, + "learning_rate": 1.2786666666666667e-06, + "loss": 0.0016, + "num_tokens": 1820178.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 114.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023190563544631004, + "kl": 0.002787799807265401, + "learning_rate": 1.2783333333333333e-06, + "loss": 0.0001, + "num_tokens": 1820460.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.263566970825195, + "kl": 0.08319773897528648, + "learning_rate": 1.278e-06, + "loss": -0.0329, + "num_tokens": 1820762.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 114.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015998879447579384, + "kl": 0.003295719623565674, + "learning_rate": 1.2776666666666667e-06, + "loss": 0.0002, + "num_tokens": 1821066.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 114.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020607894111890346, + "kl": 8.180737495422363e-06, + "learning_rate": 1.2773333333333332e-06, + "loss": 0.0, + "num_tokens": 1821286.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 114.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02200189046561718, + "kl": 0.012004107236862183, + "learning_rate": 1.277e-06, + "loss": 0.0006, + "num_tokens": 1821546.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 114.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4950074851512909, + "kl": 0.09659002721309662, + "learning_rate": 1.2766666666666668e-06, + "loss": 0.0048, + "num_tokens": 1821818.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 114.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24328814446926117, + "kl": 0.04084146022796631, + "learning_rate": 1.2763333333333334e-06, + "loss": 0.0021, + "num_tokens": 1822137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 114.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006257086992263794, + "kl": 0.0018291417509317398, + "learning_rate": 1.2760000000000001e-06, + "loss": 0.0001, + "num_tokens": 1822449.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 114.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053064122796058655, + "kl": 0.0031415367411682382, + "learning_rate": 1.2756666666666667e-06, + "loss": 0.0002, + "num_tokens": 1822698.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 114.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015413051471114159, + "kl": 0.0016451667761430144, + "learning_rate": 1.2753333333333333e-06, + "loss": 0.0001, + "num_tokens": 1822994.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 114.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.064087390899658, + "kl": 0.2727709859609604, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "num_tokens": 1823297.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 114.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6945916414260864, + "kl": 0.11913560517132282, + "learning_rate": 1.2746666666666666e-06, + "loss": 0.006, + "num_tokens": 1823557.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06170351058244705, + "kl": 0.007386990590021014, + "learning_rate": 1.2743333333333332e-06, + "loss": 0.0004, + "num_tokens": 1823860.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 114.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19605782628059387, + "kl": 0.019883667584508657, + "learning_rate": 1.2740000000000002e-06, + "loss": 0.0011, + "num_tokens": 1824204.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 114.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03629698604345322, + "kl": 0.008807201404124498, + "learning_rate": 1.2736666666666668e-06, + "loss": 0.0004, + "num_tokens": 1824488.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 114.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13570334017276764, + "kl": 0.02256380021572113, + "learning_rate": 1.2733333333333334e-06, + "loss": 0.0014, + "num_tokens": 1824766.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 114.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.176393985748291, + "kl": 0.12488642707467079, + "learning_rate": 1.2730000000000001e-06, + "loss": 0.0306, + "num_tokens": 1825143.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 114.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02194606512784958, + "kl": 0.000545364135177806, + "learning_rate": 1.2726666666666667e-06, + "loss": 0.0, + "num_tokens": 1825411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 114.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02322235144674778, + "kl": 0.00403057795483619, + "learning_rate": 1.2723333333333333e-06, + "loss": 0.0002, + "num_tokens": 1825741.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 114.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02036534622311592, + "kl": 0.0009313449263572693, + "learning_rate": 1.272e-06, + "loss": 0.0, + "num_tokens": 1826001.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 114.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5926064848899841, + "kl": 0.06984907109290361, + "learning_rate": 1.2716666666666666e-06, + "loss": 0.0038, + "num_tokens": 1826325.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 114.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16381967067718506, + "kl": 0.11497493088245392, + "learning_rate": 1.2713333333333332e-06, + "loss": 0.0057, + "num_tokens": 1826697.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 114.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03609957918524742, + "kl": 0.003011562628671527, + "learning_rate": 1.2710000000000002e-06, + "loss": 0.0002, + "num_tokens": 1826969.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059980399906635284, + "kl": 0.03339464124292135, + "learning_rate": 1.2706666666666668e-06, + "loss": 0.0017, + "num_tokens": 1827295.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08144175261259079, + "kl": 0.030467216856777668, + "learning_rate": 1.2703333333333333e-06, + "loss": 0.0015, + "num_tokens": 1827605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 114.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007697722758166492, + "kl": 0.0037793144583702087, + "learning_rate": 1.2700000000000001e-06, + "loss": 0.0002, + "num_tokens": 1827841.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 114.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005531348753720522, + "kl": 0.0004951953742420301, + "learning_rate": 1.2696666666666667e-06, + "loss": 0.0, + "num_tokens": 1828101.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 114.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09417324513196945, + "kl": 0.009912568144500256, + "learning_rate": 1.2693333333333333e-06, + "loss": 0.0005, + "num_tokens": 1828435.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 114.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055066537111997604, + "kl": 0.04799327440559864, + "learning_rate": 1.269e-06, + "loss": 0.0024, + "num_tokens": 1828839.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09334663301706314, + "kl": 0.012137975078076124, + "learning_rate": 1.2686666666666666e-06, + "loss": 0.0006, + "num_tokens": 1829130.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 114.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01382592786103487, + "kl": 0.0015658079646527767, + "learning_rate": 1.2683333333333332e-06, + "loss": 0.0001, + "num_tokens": 1829458.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 114.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3998969495296478, + "kl": 0.03045146632939577, + "learning_rate": 1.2680000000000002e-06, + "loss": 0.0015, + "num_tokens": 1829717.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 114.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028034979477524757, + "kl": 0.0395667664706707, + "learning_rate": 1.2676666666666668e-06, + "loss": 0.002, + "num_tokens": 1830009.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 114.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019692983478307724, + "kl": 0.0013635685172630474, + "learning_rate": 1.2673333333333333e-06, + "loss": 0.0001, + "num_tokens": 1830320.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 114.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14744527637958527, + "kl": 0.016954160062596202, + "learning_rate": 1.2670000000000001e-06, + "loss": 0.0008, + "num_tokens": 1830580.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 114.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.291611433029175, + "kl": 0.015495841391384602, + "learning_rate": 1.2666666666666667e-06, + "loss": -0.1883, + "num_tokens": 1830868.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 6201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 114.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014595337212085724, + "kl": 0.0006506634672405198, + "learning_rate": 1.2663333333333333e-06, + "loss": 0.0, + "num_tokens": 1831103.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 114.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045938242226839066, + "kl": 0.16250810772180557, + "learning_rate": 1.266e-06, + "loss": 0.0081, + "num_tokens": 1831412.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 114.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2536752223968506, + "kl": 0.02076407801359892, + "learning_rate": 1.2656666666666666e-06, + "loss": 0.0009, + "num_tokens": 1831678.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 114.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022744471207261086, + "kl": 0.0013867318630218506, + "learning_rate": 1.2653333333333334e-06, + "loss": 0.0001, + "num_tokens": 1831894.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 114.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01388757023960352, + "kl": 0.00033307820558547974, + "learning_rate": 1.2650000000000002e-06, + "loss": 0.0, + "num_tokens": 1832102.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 114.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027198635041713715, + "kl": 0.000445952988229692, + "learning_rate": 1.2646666666666667e-06, + "loss": 0.0, + "num_tokens": 1832358.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 114.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05515242740511894, + "kl": 0.04820450767874718, + "learning_rate": 1.2643333333333333e-06, + "loss": 0.0024, + "num_tokens": 1832726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 114.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0941758155822754, + "kl": 0.14915595948696136, + "learning_rate": 1.264e-06, + "loss": -0.0212, + "num_tokens": 1833063.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 115.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004696240648627281, + "kl": 0.0011134495434816927, + "learning_rate": 1.2636666666666667e-06, + "loss": 0.0001, + "num_tokens": 1833279.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 115.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.094312384724617, + "kl": 0.04079294204711914, + "learning_rate": 1.2633333333333332e-06, + "loss": 0.002, + "num_tokens": 1833553.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 115.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03498869761824608, + "kl": 0.024879327043890953, + "learning_rate": 1.263e-06, + "loss": 0.0013, + "num_tokens": 1833915.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 115.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8750035762786865, + "kl": 0.0767012257128954, + "learning_rate": 1.2626666666666666e-06, + "loss": -0.1228, + "num_tokens": 1834275.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 115.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04909810796380043, + "kl": 0.015334242023527622, + "learning_rate": 1.2623333333333334e-06, + "loss": 0.0008, + "num_tokens": 1834585.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 115.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007737711071968079, + "kl": 0.007780902087688446, + "learning_rate": 1.2620000000000002e-06, + "loss": 0.0004, + "num_tokens": 1834857.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 115.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005330335348844528, + "kl": 0.00037394398532342166, + "learning_rate": 1.2616666666666667e-06, + "loss": 0.0, + "num_tokens": 1835171.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021202364936470985, + "kl": 0.0016954689635895193, + "learning_rate": 1.2613333333333333e-06, + "loss": 0.0001, + "num_tokens": 1835431.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 115.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0466022789478302, + "kl": 0.019457083195447922, + "learning_rate": 1.261e-06, + "loss": 0.001, + "num_tokens": 1835793.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009932305663824081, + "kl": 0.0006368197500705719, + "learning_rate": 1.2606666666666667e-06, + "loss": 0.0, + "num_tokens": 1836055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 115.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012711938470602036, + "kl": 0.0022020963951945305, + "learning_rate": 1.2603333333333334e-06, + "loss": 0.0001, + "num_tokens": 1836367.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 115.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04842125251889229, + "kl": 0.0027758406940847635, + "learning_rate": 1.26e-06, + "loss": 0.0001, + "num_tokens": 1836695.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7433775067329407, + "kl": 0.03612457067356445, + "learning_rate": 1.2596666666666666e-06, + "loss": 0.0018, + "num_tokens": 1836951.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 115.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012128106318414211, + "kl": 8.714944124221802e-05, + "learning_rate": 1.2593333333333334e-06, + "loss": 0.0, + "num_tokens": 1837195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 115.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12190528213977814, + "kl": 0.058578457683324814, + "learning_rate": 1.2590000000000001e-06, + "loss": 0.0029, + "num_tokens": 1837541.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 115.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06681372225284576, + "kl": 0.004231396829709411, + "learning_rate": 1.2586666666666667e-06, + "loss": 0.0002, + "num_tokens": 1837809.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 115.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07929566502571106, + "kl": 0.031787254847586155, + "learning_rate": 1.2583333333333335e-06, + "loss": 0.0014, + "num_tokens": 1838167.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 115.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04284188896417618, + "kl": 0.04354099929332733, + "learning_rate": 1.258e-06, + "loss": 0.0022, + "num_tokens": 1838572.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 115.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1978859007358551, + "kl": 0.019339651567861438, + "learning_rate": 1.2576666666666666e-06, + "loss": 0.0011, + "num_tokens": 1838908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04125714674592018, + "kl": 0.002634570235386491, + "learning_rate": 1.2573333333333334e-06, + "loss": 0.0001, + "num_tokens": 1839180.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 115.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22951935231685638, + "kl": 0.024778328835964203, + "learning_rate": 1.257e-06, + "loss": 0.0013, + "num_tokens": 1839472.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 115.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07451792061328888, + "kl": 0.02749769389629364, + "learning_rate": 1.2566666666666666e-06, + "loss": 0.0014, + "num_tokens": 1839774.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 115.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10316783934831619, + "kl": 0.023797186091542244, + "learning_rate": 1.2563333333333333e-06, + "loss": 0.0012, + "num_tokens": 1840048.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 115.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05592063441872597, + "kl": 0.005200710846111178, + "learning_rate": 1.2560000000000001e-06, + "loss": 0.0003, + "num_tokens": 1840350.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 115.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024762703105807304, + "kl": 0.0011641234159469604, + "learning_rate": 1.2556666666666667e-06, + "loss": 0.0001, + "num_tokens": 1840562.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 115.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.841092586517334, + "kl": 0.16811568662524223, + "learning_rate": 1.2553333333333335e-06, + "loss": 0.0078, + "num_tokens": 1840855.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 115.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0937301516532898, + "kl": 0.042482590302824974, + "learning_rate": 1.255e-06, + "loss": 0.0021, + "num_tokens": 1841192.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 115.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.173649311065674, + "kl": 0.052505326457321644, + "learning_rate": 1.2546666666666666e-06, + "loss": 0.1775, + "num_tokens": 1841466.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 6237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 115.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0060199410654604435, + "kl": 0.2678922414779663, + "learning_rate": 1.2543333333333334e-06, + "loss": 0.0134, + "num_tokens": 1841770.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 115.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21355508267879486, + "kl": 0.0890326090157032, + "learning_rate": 1.254e-06, + "loss": 0.0044, + "num_tokens": 1842142.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 115.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11880111694335938, + "kl": 0.017414493719115853, + "learning_rate": 1.2536666666666666e-06, + "loss": 0.001, + "num_tokens": 1842485.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 115.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011011568829417229, + "kl": 0.0006151107081677765, + "learning_rate": 1.2533333333333335e-06, + "loss": 0.0, + "num_tokens": 1842720.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 115.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04526427760720253, + "kl": 0.12831896916031837, + "learning_rate": 1.2530000000000001e-06, + "loss": 0.0065, + "num_tokens": 1843029.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 115.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018583742901682854, + "kl": 0.012669486925005913, + "learning_rate": 1.2526666666666667e-06, + "loss": 0.0006, + "num_tokens": 1843289.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 115.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000533915008418262, + "kl": 0.0012635865132324398, + "learning_rate": 1.2523333333333335e-06, + "loss": 0.0001, + "num_tokens": 1843569.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 115.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.43353140354156494, + "kl": 0.02571816649287939, + "learning_rate": 1.252e-06, + "loss": 0.002, + "num_tokens": 1843805.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 115.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5229374170303345, + "kl": 0.030647223815321922, + "learning_rate": 1.2516666666666666e-06, + "loss": 0.0047, + "num_tokens": 1844170.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 6246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 115.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02296511083841324, + "kl": 0.0036907498724758625, + "learning_rate": 1.2513333333333334e-06, + "loss": 0.0002, + "num_tokens": 1844440.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 115.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21778427064418793, + "kl": 0.03142505802679807, + "learning_rate": 1.251e-06, + "loss": 0.0016, + "num_tokens": 1844725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 115.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11077558994293213, + "kl": 0.007567106746137142, + "learning_rate": 1.2506666666666665e-06, + "loss": 0.0004, + "num_tokens": 1845023.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 115.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1737356334924698, + "kl": 0.0209151110611856, + "learning_rate": 1.2503333333333335e-06, + "loss": 0.001, + "num_tokens": 1845315.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 115.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018107322975993156, + "kl": 0.00026499107480049133, + "learning_rate": 1.25e-06, + "loss": 0.0, + "num_tokens": 1845575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05953364074230194, + "kl": 0.003828287524811458, + "learning_rate": 1.2496666666666667e-06, + "loss": 0.0002, + "num_tokens": 1845851.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 115.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007998199434950948, + "kl": 0.0037676095962524414, + "learning_rate": 1.2493333333333335e-06, + "loss": 0.0002, + "num_tokens": 1846087.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 115.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012243026867508888, + "kl": 0.09739917516708374, + "learning_rate": 1.249e-06, + "loss": 0.0049, + "num_tokens": 1846459.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 115.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05324836075305939, + "kl": 0.01230662316083908, + "learning_rate": 1.2486666666666666e-06, + "loss": 0.0006, + "num_tokens": 1846786.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 115.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6422905921936035, + "kl": 0.07225701492279768, + "learning_rate": 1.2483333333333334e-06, + "loss": 0.0026, + "num_tokens": 1847102.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 115.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00014937584637664258, + "kl": 4.7013163566589355e-06, + "learning_rate": 1.248e-06, + "loss": 0.0, + "num_tokens": 1847322.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 115.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04233884811401367, + "kl": 0.001320550829404965, + "learning_rate": 1.2476666666666665e-06, + "loss": 0.0001, + "num_tokens": 1847590.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 115.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04144356772303581, + "kl": 0.008482268545776606, + "learning_rate": 1.2473333333333335e-06, + "loss": 0.0004, + "num_tokens": 1847872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 115.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008734427392482758, + "kl": 0.0006128549721324816, + "learning_rate": 1.247e-06, + "loss": 0.0, + "num_tokens": 1848091.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 115.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3464406132698059, + "kl": 0.04284094646573067, + "learning_rate": 1.2466666666666667e-06, + "loss": 0.0024, + "num_tokens": 1848302.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 115.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04640262573957443, + "kl": 0.0011221036547794938, + "learning_rate": 1.2463333333333334e-06, + "loss": 0.0001, + "num_tokens": 1848515.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 115.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09202194213867188, + "kl": 0.009811186231672764, + "learning_rate": 1.246e-06, + "loss": 0.0005, + "num_tokens": 1848808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 116.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03162007033824921, + "kl": 0.015394420363008976, + "learning_rate": 1.2456666666666666e-06, + "loss": 0.0008, + "num_tokens": 1849095.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 116.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1439974457025528, + "kl": 0.0052751151961274445, + "learning_rate": 1.2453333333333334e-06, + "loss": 0.0003, + "num_tokens": 1849351.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 116.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15991061925888062, + "kl": 0.042885567992925644, + "learning_rate": 1.245e-06, + "loss": 0.0019, + "num_tokens": 1849644.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 116.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.141469955444336, + "kl": 0.045004235580563545, + "learning_rate": 1.2446666666666667e-06, + "loss": 0.2184, + "num_tokens": 1850047.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 6267 + }, + { + "clip_ratio/high_max": 0.00909090880304575, + "clip_ratio/high_mean": 0.00909090880304575, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00909090880304575, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 116.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.942110061645508, + "kl": 0.7526375111192465, + "learning_rate": 1.2443333333333335e-06, + "loss": 0.0343, + "num_tokens": 1850406.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 6268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 116.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03768537566065788, + "kl": 0.00553034245967865, + "learning_rate": 1.244e-06, + "loss": 0.0003, + "num_tokens": 1850699.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07933907955884933, + "kl": 0.004391956143081188, + "learning_rate": 1.2436666666666666e-06, + "loss": 0.0002, + "num_tokens": 1850995.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00708309980109334, + "kl": 0.0014140590792521834, + "learning_rate": 1.2433333333333334e-06, + "loss": 0.0001, + "num_tokens": 1851272.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 116.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0064334324561059475, + "kl": 0.2678179293870926, + "learning_rate": 1.243e-06, + "loss": 0.0134, + "num_tokens": 1851576.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 116.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000789306708611548, + "kl": 0.003765881061553955, + "learning_rate": 1.2426666666666666e-06, + "loss": 0.0002, + "num_tokens": 1851812.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 116.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020940933376550674, + "kl": 0.012090378440916538, + "learning_rate": 1.2423333333333334e-06, + "loss": 0.0006, + "num_tokens": 1852072.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 116.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010890180245041847, + "kl": 0.000506911426782608, + "learning_rate": 1.242e-06, + "loss": 0.0, + "num_tokens": 1852332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 116.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9097113609313965, + "kl": 0.03586278576403856, + "learning_rate": 1.2416666666666667e-06, + "loss": -0.0026, + "num_tokens": 1852640.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 6276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06783688068389893, + "kl": 0.018612314481288195, + "learning_rate": 1.2413333333333335e-06, + "loss": 0.001, + "num_tokens": 1852914.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 116.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012533112429082394, + "kl": 0.09734746441245079, + "learning_rate": 1.241e-06, + "loss": 0.0049, + "num_tokens": 1853286.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02706988900899887, + "kl": 0.001588658895343542, + "learning_rate": 1.2406666666666666e-06, + "loss": 0.0001, + "num_tokens": 1853558.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 116.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023116370663046837, + "kl": 0.0013605743151856586, + "learning_rate": 1.2403333333333334e-06, + "loss": 0.0001, + "num_tokens": 1853777.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 116.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1760350912809372, + "kl": 0.18322789669036865, + "learning_rate": 1.24e-06, + "loss": 0.0092, + "num_tokens": 1854085.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 49.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 49.0, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 116.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04928117245435715, + "kl": 0.009053825866430998, + "learning_rate": 1.2396666666666668e-06, + "loss": 0.0005, + "num_tokens": 1854501.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 116.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05760377272963524, + "kl": 0.015146711841225624, + "learning_rate": 1.2393333333333333e-06, + "loss": 0.0008, + "num_tokens": 1854802.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 116.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17278456687927246, + "kl": 0.06444323062896729, + "learning_rate": 1.239e-06, + "loss": 0.0032, + "num_tokens": 1855206.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 116.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04154883325099945, + "kl": 0.001979883905733004, + "learning_rate": 1.2386666666666667e-06, + "loss": 0.0001, + "num_tokens": 1855482.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 116.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09533293545246124, + "kl": 0.01082206517457962, + "learning_rate": 1.2383333333333335e-06, + "loss": 0.0005, + "num_tokens": 1855782.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 116.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14209595322608948, + "kl": 0.017969570588320494, + "learning_rate": 1.238e-06, + "loss": 0.0009, + "num_tokens": 1856095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 116.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5980515480041504, + "kl": 0.08496873347030487, + "learning_rate": 1.2376666666666666e-06, + "loss": 0.0045, + "num_tokens": 1856339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6288 + }, + { + "clip_ratio/high_max": 0.006097560748457909, + "clip_ratio/high_mean": 0.006097560748457909, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006097560748457909, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 116.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7978053092956543, + "kl": 0.08401504904031754, + "learning_rate": 1.2373333333333334e-06, + "loss": 0.0982, + "num_tokens": 1856726.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 6289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 116.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0768980011343956, + "kl": 0.007214951561763883, + "learning_rate": 1.237e-06, + "loss": 0.0004, + "num_tokens": 1857056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 116.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0632239505648613, + "kl": 0.0033106408081948757, + "learning_rate": 1.2366666666666668e-06, + "loss": 0.0002, + "num_tokens": 1857375.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 116.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6029794216156006, + "kl": 0.14259984344244003, + "learning_rate": 1.2363333333333333e-06, + "loss": 0.0641, + "num_tokens": 1857768.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.514859199523926, + "kl": 0.05777551420032978, + "learning_rate": 1.236e-06, + "loss": -0.0882, + "num_tokens": 1858047.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 116.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02193385921418667, + "kl": 0.001887825084850192, + "learning_rate": 1.2356666666666667e-06, + "loss": 0.0001, + "num_tokens": 1858307.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 116.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0277364794164896, + "kl": 0.0019534826278686523, + "learning_rate": 1.2353333333333335e-06, + "loss": 0.0001, + "num_tokens": 1858519.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 116.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028444884344935417, + "kl": 0.004912725416943431, + "learning_rate": 1.235e-06, + "loss": 0.0002, + "num_tokens": 1858807.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 116.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23661844432353973, + "kl": 0.022035363130271435, + "learning_rate": 1.2346666666666668e-06, + "loss": 0.0011, + "num_tokens": 1859065.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 116.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10691499710083008, + "kl": 0.021265359595417976, + "learning_rate": 1.2343333333333334e-06, + "loss": 0.001, + "num_tokens": 1859364.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 116.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.598431348800659, + "kl": 0.06902923434972763, + "learning_rate": 1.234e-06, + "loss": -0.0137, + "num_tokens": 1859667.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 116.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06887342035770416, + "kl": 0.024940460920333862, + "learning_rate": 1.2336666666666667e-06, + "loss": 0.0013, + "num_tokens": 1859991.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 116.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17546403408050537, + "kl": 0.013412305852398276, + "learning_rate": 1.2333333333333333e-06, + "loss": 0.0006, + "num_tokens": 1860257.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 116.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08546403050422668, + "kl": 0.00312786060385406, + "learning_rate": 1.2329999999999999e-06, + "loss": 0.0002, + "num_tokens": 1860467.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 116.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06565514951944351, + "kl": 0.005158094922080636, + "learning_rate": 1.2326666666666669e-06, + "loss": 0.0003, + "num_tokens": 1860798.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 116.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006790246348828077, + "kl": 0.00011561065912246704, + "learning_rate": 1.2323333333333334e-06, + "loss": 0.0, + "num_tokens": 1861010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 116.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061141323298215866, + "kl": 0.04211212135851383, + "learning_rate": 1.232e-06, + "loss": 0.0022, + "num_tokens": 1861301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 116.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4490175247192383, + "kl": 0.32804083079099655, + "learning_rate": 1.2316666666666668e-06, + "loss": 0.0182, + "num_tokens": 1861522.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 116.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.791076898574829, + "kl": 0.07484490307979286, + "learning_rate": 1.2313333333333334e-06, + "loss": 0.0023, + "num_tokens": 1861788.0, + "reward": 5.5, + "reward_std": 3.316624879837036, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 3.316624879837036, + "step": 6307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 116.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015503501519560814, + "kl": 0.0007985396514413878, + "learning_rate": 1.231e-06, + "loss": 0.0, + "num_tokens": 1862097.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 116.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05024037882685661, + "kl": 0.010773248039186, + "learning_rate": 1.2306666666666667e-06, + "loss": 0.0005, + "num_tokens": 1862432.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 116.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011990761384367943, + "kl": 0.00043349614134058356, + "learning_rate": 1.2303333333333333e-06, + "loss": 0.0, + "num_tokens": 1862702.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 116.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08177473396062851, + "kl": 0.014754015253856778, + "learning_rate": 1.2299999999999999e-06, + "loss": 0.0008, + "num_tokens": 1862986.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 116.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004946709610521793, + "kl": 0.00031425655470229685, + "learning_rate": 1.2296666666666669e-06, + "loss": 0.0, + "num_tokens": 1863246.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 116.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17845740914344788, + "kl": 0.020642086397856474, + "learning_rate": 1.2293333333333334e-06, + "loss": 0.0011, + "num_tokens": 1863538.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 116.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0068844957277178764, + "kl": 0.00207352451980114, + "learning_rate": 1.229e-06, + "loss": 0.0001, + "num_tokens": 1863850.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 116.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024111207574605942, + "kl": 0.000682694575516507, + "learning_rate": 1.2286666666666668e-06, + "loss": 0.0, + "num_tokens": 1864084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 116.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020938321948051453, + "kl": 0.002724895952269435, + "learning_rate": 1.2283333333333334e-06, + "loss": 0.0001, + "num_tokens": 1864366.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 116.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6453166007995605, + "kl": 0.02425048127770424, + "learning_rate": 1.228e-06, + "loss": -0.0214, + "num_tokens": 1864684.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11007286608219147, + "kl": 0.0028574815951287746, + "learning_rate": 1.2276666666666667e-06, + "loss": 0.0002, + "num_tokens": 1864911.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015436398796737194, + "kl": 0.0014515546499751508, + "learning_rate": 1.2273333333333333e-06, + "loss": 0.0001, + "num_tokens": 1865179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 117.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6589757204055786, + "kl": 0.05297594587318599, + "learning_rate": 1.2269999999999999e-06, + "loss": 0.003, + "num_tokens": 1865475.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 117.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020373262465000153, + "kl": 0.009699301328510046, + "learning_rate": 1.2266666666666669e-06, + "loss": 0.0005, + "num_tokens": 1865769.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059715352952480316, + "kl": 0.0029501643730327487, + "learning_rate": 1.2263333333333334e-06, + "loss": 0.0001, + "num_tokens": 1866065.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 117.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0689995288848877, + "kl": 0.4349171072244644, + "learning_rate": 1.226e-06, + "loss": 0.0377, + "num_tokens": 1866439.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 6323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 117.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004212557338178158, + "kl": 0.0002928256872110069, + "learning_rate": 1.2256666666666668e-06, + "loss": 0.0, + "num_tokens": 1866659.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 117.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061383381485939026, + "kl": 0.0010862275958061218, + "learning_rate": 1.2253333333333333e-06, + "loss": 0.0001, + "num_tokens": 1866869.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 117.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005569239612668753, + "kl": 0.0006695190968457609, + "learning_rate": 1.225e-06, + "loss": 0.0, + "num_tokens": 1867085.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04014762118458748, + "kl": 0.0036010071635246277, + "learning_rate": 1.2246666666666667e-06, + "loss": 0.0002, + "num_tokens": 1867301.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 117.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12944601476192474, + "kl": 0.03632536344230175, + "learning_rate": 1.2243333333333333e-06, + "loss": 0.0018, + "num_tokens": 1867572.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 117.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012796190567314625, + "kl": 0.00048668310046195984, + "learning_rate": 1.224e-06, + "loss": 0.0, + "num_tokens": 1867832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5973250865936279, + "kl": 0.04203704744577408, + "learning_rate": 1.2236666666666668e-06, + "loss": 0.0021, + "num_tokens": 1868068.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 117.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024066155776381493, + "kl": 0.000497678731335327, + "learning_rate": 1.2233333333333334e-06, + "loss": 0.0, + "num_tokens": 1868317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 117.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007013546768575907, + "kl": 0.26768939197063446, + "learning_rate": 1.223e-06, + "loss": 0.0134, + "num_tokens": 1868621.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 117.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09192944318056107, + "kl": 0.014576570130884647, + "learning_rate": 1.2226666666666668e-06, + "loss": 0.0006, + "num_tokens": 1868914.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 117.29629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.656205177307129, + "kl": 0.06435461342334747, + "learning_rate": 1.2223333333333333e-06, + "loss": 0.0036, + "num_tokens": 1869248.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00018164316134061664, + "kl": 5.990266799926758e-06, + "learning_rate": 1.222e-06, + "loss": 0.0, + "num_tokens": 1869468.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 117.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1757955104112625, + "kl": 0.05981072038412094, + "learning_rate": 1.2216666666666667e-06, + "loss": 0.0029, + "num_tokens": 1869774.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 117.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13805396854877472, + "kl": 0.017180890077725053, + "learning_rate": 1.2213333333333333e-06, + "loss": 0.0009, + "num_tokens": 1870068.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 117.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03243164345622063, + "kl": 0.004565113689750433, + "learning_rate": 1.221e-06, + "loss": 0.0002, + "num_tokens": 1870372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 117.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016014624387025833, + "kl": 0.0007988003198988736, + "learning_rate": 1.2206666666666668e-06, + "loss": 0.0, + "num_tokens": 1870634.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 117.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.478219985961914, + "kl": 0.1537754898890853, + "learning_rate": 1.2203333333333334e-06, + "loss": 0.0279, + "num_tokens": 1870938.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 117.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061416368931531906, + "kl": 0.005006879044231027, + "learning_rate": 1.22e-06, + "loss": 0.0003, + "num_tokens": 1871265.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 117.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1589430272579193, + "kl": 0.02481132373213768, + "learning_rate": 1.2196666666666667e-06, + "loss": 0.0012, + "num_tokens": 1871580.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 117.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05281635746359825, + "kl": 0.03615730442106724, + "learning_rate": 1.2193333333333333e-06, + "loss": 0.0018, + "num_tokens": 1871953.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 117.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006940820254385471, + "kl": 0.0001445829875592608, + "learning_rate": 1.219e-06, + "loss": 0.0, + "num_tokens": 1872209.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 117.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0402204766869545, + "kl": 0.009526151698082685, + "learning_rate": 1.2186666666666667e-06, + "loss": 0.0005, + "num_tokens": 1872468.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 117.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.200370788574219, + "kl": 0.1066192900761962, + "learning_rate": 1.2183333333333332e-06, + "loss": 0.0946, + "num_tokens": 1872771.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 117.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3626270294189453, + "kl": 0.016668145544826984, + "learning_rate": 1.218e-06, + "loss": 0.1227, + "num_tokens": 1873114.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 6347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 117.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049479641020298004, + "kl": 0.006176235852763057, + "learning_rate": 1.2176666666666668e-06, + "loss": 0.0003, + "num_tokens": 1873396.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 117.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15409399569034576, + "kl": 0.011553944554179907, + "learning_rate": 1.2173333333333334e-06, + "loss": 0.0006, + "num_tokens": 1873654.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 117.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023245124146342278, + "kl": 0.1609538048505783, + "learning_rate": 1.217e-06, + "loss": 0.008, + "num_tokens": 1873963.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 117.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00604587746784091, + "kl": 0.00033239772892557085, + "learning_rate": 1.2166666666666667e-06, + "loss": 0.0, + "num_tokens": 1874235.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.688149929046631, + "kl": 0.03636751603335142, + "learning_rate": 1.2163333333333333e-06, + "loss": 0.1454, + "num_tokens": 1874512.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 117.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012772396206855774, + "kl": 0.09722325205802917, + "learning_rate": 1.216e-06, + "loss": 0.0049, + "num_tokens": 1874884.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061891283839941025, + "kl": 0.004158852971158922, + "learning_rate": 1.2156666666666667e-06, + "loss": 0.0002, + "num_tokens": 1875138.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 117.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05476026237010956, + "kl": 0.03428677376359701, + "learning_rate": 1.2153333333333332e-06, + "loss": 0.0017, + "num_tokens": 1875549.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 117.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08282981067895889, + "kl": 0.02204325655475259, + "learning_rate": 1.215e-06, + "loss": 0.0011, + "num_tokens": 1875886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 117.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07960408926010132, + "kl": 0.008824027609080076, + "learning_rate": 1.2146666666666668e-06, + "loss": 0.0004, + "num_tokens": 1876216.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 117.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04834507405757904, + "kl": 0.001701838686130941, + "learning_rate": 1.2143333333333334e-06, + "loss": 0.0001, + "num_tokens": 1876488.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 117.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02579350955784321, + "kl": 0.00026485323905944824, + "learning_rate": 1.2140000000000002e-06, + "loss": 0.0, + "num_tokens": 1876700.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 117.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47691863775253296, + "kl": 0.10419574286788702, + "learning_rate": 1.2136666666666667e-06, + "loss": 0.0046, + "num_tokens": 1876991.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019206425175070763, + "kl": 0.0009514418197795749, + "learning_rate": 1.2133333333333333e-06, + "loss": 0.0, + "num_tokens": 1877263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 117.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05284281075000763, + "kl": 0.009344940539449453, + "learning_rate": 1.213e-06, + "loss": 0.0005, + "num_tokens": 1877527.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 117.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0187024287879467, + "kl": 0.0004899409395875409, + "learning_rate": 1.2126666666666666e-06, + "loss": 0.0, + "num_tokens": 1877760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 117.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7793655395507812, + "kl": 0.044804759323596954, + "learning_rate": 1.2123333333333332e-06, + "loss": 0.1268, + "num_tokens": 1878102.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 117.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006062432657927275, + "kl": 0.0003211127477698028, + "learning_rate": 1.2120000000000002e-06, + "loss": 0.0, + "num_tokens": 1878416.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012449974194169044, + "kl": 0.00191789137898013, + "learning_rate": 1.2116666666666668e-06, + "loss": 0.0001, + "num_tokens": 1878696.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 117.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02751999720931053, + "kl": 0.009616275317966938, + "learning_rate": 1.2113333333333334e-06, + "loss": 0.0005, + "num_tokens": 1879022.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 117.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04843913018703461, + "kl": 0.002564593218266964, + "learning_rate": 1.2110000000000001e-06, + "loss": 0.0001, + "num_tokens": 1879345.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 117.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06524127721786499, + "kl": 0.030951189808547497, + "learning_rate": 1.2106666666666667e-06, + "loss": 0.0015, + "num_tokens": 1879682.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 117.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5153093338012695, + "kl": 0.0432470440864563, + "learning_rate": 1.2103333333333333e-06, + "loss": -0.036, + "num_tokens": 1879963.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 6370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 117.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01231038011610508, + "kl": 0.0023086676374077797, + "learning_rate": 1.21e-06, + "loss": 0.0001, + "num_tokens": 1880275.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 118.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2309725284576416, + "kl": 0.058851104229688644, + "learning_rate": 1.2096666666666666e-06, + "loss": -0.0089, + "num_tokens": 1880639.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 118.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037684952840209007, + "kl": 6.638467311859131e-05, + "learning_rate": 1.2093333333333332e-06, + "loss": 0.0, + "num_tokens": 1880851.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04324539005756378, + "kl": 0.00459635304287076, + "learning_rate": 1.2090000000000002e-06, + "loss": 0.0002, + "num_tokens": 1881144.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030519749969244003, + "kl": 0.0010947728587780148, + "learning_rate": 1.2086666666666668e-06, + "loss": 0.0001, + "num_tokens": 1881415.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 118.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20741461217403412, + "kl": 0.02500736666843295, + "learning_rate": 1.2083333333333333e-06, + "loss": 0.0012, + "num_tokens": 1881711.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2913591861724854, + "kl": 0.06482912134379148, + "learning_rate": 1.2080000000000001e-06, + "loss": -0.2348, + "num_tokens": 1882057.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 118.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17156784236431122, + "kl": 0.012978978455066681, + "learning_rate": 1.2076666666666667e-06, + "loss": 0.0007, + "num_tokens": 1882323.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 118.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009120596572756767, + "kl": 0.0014048232696950436, + "learning_rate": 1.2073333333333333e-06, + "loss": 0.0001, + "num_tokens": 1882619.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 118.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18915152549743652, + "kl": 0.05025875195860863, + "learning_rate": 1.207e-06, + "loss": 0.0025, + "num_tokens": 1882923.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01114656776189804, + "kl": 0.0013795166742056608, + "learning_rate": 1.2066666666666666e-06, + "loss": 0.0001, + "num_tokens": 1883183.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04206422343850136, + "kl": 0.0020734071731567383, + "learning_rate": 1.2063333333333332e-06, + "loss": 0.0001, + "num_tokens": 1883447.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 118.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05972485616803169, + "kl": 0.006392789771780372, + "learning_rate": 1.2060000000000002e-06, + "loss": 0.0003, + "num_tokens": 1883773.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 118.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05720462277531624, + "kl": 0.009414592292159796, + "learning_rate": 1.2056666666666668e-06, + "loss": 0.0005, + "num_tokens": 1884108.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 118.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05203743651509285, + "kl": 0.01133543811738491, + "learning_rate": 1.2053333333333333e-06, + "loss": 0.0006, + "num_tokens": 1884372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 118.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0206092968583107, + "kl": 0.05186851881444454, + "learning_rate": 1.2050000000000001e-06, + "loss": 0.0026, + "num_tokens": 1884704.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 118.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0783233642578125, + "kl": 0.03990246541798115, + "learning_rate": 1.2046666666666667e-06, + "loss": 0.002, + "num_tokens": 1885072.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05945427715778351, + "kl": 0.0060268850065767765, + "learning_rate": 1.2043333333333333e-06, + "loss": 0.0003, + "num_tokens": 1885332.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 118.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03179972618818283, + "kl": 0.006192940287292004, + "learning_rate": 1.204e-06, + "loss": 0.0003, + "num_tokens": 1885600.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 118.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13461224734783173, + "kl": 0.02167446445673704, + "learning_rate": 1.2036666666666666e-06, + "loss": 0.0011, + "num_tokens": 1885890.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 118.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042726099491119385, + "kl": 0.004009530181065202, + "learning_rate": 1.2033333333333334e-06, + "loss": 0.0002, + "num_tokens": 1886218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 118.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1628943532705307, + "kl": 0.02399781160056591, + "learning_rate": 1.2030000000000002e-06, + "loss": 0.0014, + "num_tokens": 1886486.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3520936965942383, + "kl": 0.17479710280895233, + "learning_rate": 1.2026666666666667e-06, + "loss": -0.2146, + "num_tokens": 1886842.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 6393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 118.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09283898025751114, + "kl": 0.006904813519213349, + "learning_rate": 1.2023333333333333e-06, + "loss": 0.0004, + "num_tokens": 1887082.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 118.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.517305374145508, + "kl": 0.031096127349883318, + "learning_rate": 1.202e-06, + "loss": 0.0803, + "num_tokens": 1887359.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 6395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 118.44444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7017276287078857, + "kl": 0.1395142897963524, + "learning_rate": 1.2016666666666667e-06, + "loss": -0.0128, + "num_tokens": 1887719.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 118.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019146645441651344, + "kl": 0.0023343415232375264, + "learning_rate": 1.2013333333333332e-06, + "loss": 0.0001, + "num_tokens": 1888033.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007597935618832707, + "kl": 0.001225618296302855, + "learning_rate": 1.201e-06, + "loss": 0.0001, + "num_tokens": 1888313.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 118.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05427346006035805, + "kl": 0.2748561501502991, + "learning_rate": 1.2006666666666666e-06, + "loss": 0.0137, + "num_tokens": 1888617.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11704078316688538, + "kl": 0.0389588437974453, + "learning_rate": 1.2003333333333334e-06, + "loss": 0.0019, + "num_tokens": 1888890.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 118.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054681889712810516, + "kl": 0.001852313638664782, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "num_tokens": 1889102.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 118.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06008169800043106, + "kl": 0.01628345251083374, + "learning_rate": 1.1996666666666667e-06, + "loss": 0.0009, + "num_tokens": 1889431.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 118.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1296497881412506, + "kl": 0.027248432859778404, + "learning_rate": 1.1993333333333333e-06, + "loss": 0.0014, + "num_tokens": 1889751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 118.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01889699138700962, + "kl": 0.0012212160945637152, + "learning_rate": 1.199e-06, + "loss": 0.0001, + "num_tokens": 1890060.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021963927894830704, + "kl": 0.001077877648640424, + "learning_rate": 1.1986666666666667e-06, + "loss": 0.0001, + "num_tokens": 1890340.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 118.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06439710408449173, + "kl": 0.001757104037096724, + "learning_rate": 1.1983333333333334e-06, + "loss": 0.0001, + "num_tokens": 1890562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 118.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05360562726855278, + "kl": 0.008765297010540962, + "learning_rate": 1.198e-06, + "loss": 0.0004, + "num_tokens": 1890844.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 118.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01481380220502615, + "kl": 0.0019328041234984994, + "learning_rate": 1.1976666666666666e-06, + "loss": 0.0001, + "num_tokens": 1891126.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037906572222709656, + "kl": 0.004823529860004783, + "learning_rate": 1.1973333333333334e-06, + "loss": 0.0002, + "num_tokens": 1891419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 118.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.52617883682251, + "kl": 0.010026630014181137, + "learning_rate": 1.1970000000000001e-06, + "loss": -0.0778, + "num_tokens": 1891717.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 6410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 118.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15768961608409882, + "kl": 0.06763637065887451, + "learning_rate": 1.1966666666666667e-06, + "loss": 0.0034, + "num_tokens": 1892120.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 118.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014757541008293629, + "kl": 0.09695859253406525, + "learning_rate": 1.1963333333333333e-06, + "loss": 0.0048, + "num_tokens": 1892492.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05760321766138077, + "kl": 0.013830000767484307, + "learning_rate": 1.196e-06, + "loss": 0.0007, + "num_tokens": 1892762.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 118.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0746724009513855, + "kl": 0.01655101589858532, + "learning_rate": 1.1956666666666666e-06, + "loss": 0.0008, + "num_tokens": 1893069.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 118.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007975440821610391, + "kl": 0.0037554726004600525, + "learning_rate": 1.1953333333333334e-06, + "loss": 0.0002, + "num_tokens": 1893305.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 118.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025200095027685165, + "kl": 0.0032391101121902466, + "learning_rate": 1.195e-06, + "loss": 0.0002, + "num_tokens": 1893521.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 118.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05634867399930954, + "kl": 0.0026866591069847345, + "learning_rate": 1.1946666666666666e-06, + "loss": 0.0001, + "num_tokens": 1893840.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 118.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022323521261569113, + "kl": 8.203089237213135e-06, + "learning_rate": 1.1943333333333333e-06, + "loss": 0.0, + "num_tokens": 1894060.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 118.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01999380998313427, + "kl": 0.0007640570402145386, + "learning_rate": 1.1940000000000001e-06, + "loss": 0.0, + "num_tokens": 1894272.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 118.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.764765739440918, + "kl": 0.015726592391729355, + "learning_rate": 1.1936666666666667e-06, + "loss": 0.2206, + "num_tokens": 1894528.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 6420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 118.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031007084995508194, + "kl": 0.00040774644367047586, + "learning_rate": 1.1933333333333335e-06, + "loss": 0.0, + "num_tokens": 1894784.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 118.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9712905883789062, + "kl": 0.1457047387957573, + "learning_rate": 1.193e-06, + "loss": -0.0244, + "num_tokens": 1895106.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 6422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 118.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.586273193359375, + "kl": 0.036043503787368536, + "learning_rate": 1.1926666666666666e-06, + "loss": 0.1291, + "num_tokens": 1895407.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 118.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3200676739215851, + "kl": 0.017675194423645735, + "learning_rate": 1.1923333333333334e-06, + "loss": 0.0009, + "num_tokens": 1895728.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 118.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07494300603866577, + "kl": 0.035678806249052286, + "learning_rate": 1.192e-06, + "loss": 0.0018, + "num_tokens": 1896022.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 119.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021483000367879868, + "kl": 0.0006169751286506653, + "learning_rate": 1.1916666666666666e-06, + "loss": 0.0, + "num_tokens": 1896282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 119.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.293263912200928, + "kl": 0.08885498903691769, + "learning_rate": 1.1913333333333335e-06, + "loss": 0.1419, + "num_tokens": 1896637.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 119.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06423847377300262, + "kl": 0.009796207305043936, + "learning_rate": 1.1910000000000001e-06, + "loss": 0.0005, + "num_tokens": 1896938.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 119.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03536780923604965, + "kl": 0.0020239034784026444, + "learning_rate": 1.1906666666666667e-06, + "loss": 0.0001, + "num_tokens": 1897261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 119.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032690420746803284, + "kl": 0.002398474025540054, + "learning_rate": 1.1903333333333335e-06, + "loss": 0.0001, + "num_tokens": 1897570.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 119.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07405785471200943, + "kl": 0.0016200989484786987, + "learning_rate": 1.19e-06, + "loss": 0.0001, + "num_tokens": 1897778.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 119.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.832571029663086, + "kl": 0.4032875234261155, + "learning_rate": 1.1896666666666666e-06, + "loss": 0.044, + "num_tokens": 1898039.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 119.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16345266997814178, + "kl": 0.012418131460435688, + "learning_rate": 1.1893333333333334e-06, + "loss": 0.0008, + "num_tokens": 1898278.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 119.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026300711557269096, + "kl": 0.002880724292481318, + "learning_rate": 1.189e-06, + "loss": 0.0002, + "num_tokens": 1898548.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 119.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048063069581985474, + "kl": 0.0028284870786592364, + "learning_rate": 1.1886666666666665e-06, + "loss": 0.0001, + "num_tokens": 1898821.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 119.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018144050613045692, + "kl": 0.09649721160531044, + "learning_rate": 1.1883333333333335e-06, + "loss": 0.0048, + "num_tokens": 1899193.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 119.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10659333318471909, + "kl": 0.04098423197865486, + "learning_rate": 1.188e-06, + "loss": 0.0021, + "num_tokens": 1899504.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 119.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05494823306798935, + "kl": 0.03383501060307026, + "learning_rate": 1.1876666666666667e-06, + "loss": 0.0017, + "num_tokens": 1899863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 119.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012870515696704388, + "kl": 0.0003301863180240616, + "learning_rate": 1.1873333333333335e-06, + "loss": 0.0, + "num_tokens": 1900106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 119.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515233039855957, + "kl": 0.8560158014297485, + "learning_rate": 1.187e-06, + "loss": 0.0608, + "num_tokens": 1900411.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 119.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02664221078157425, + "kl": 0.007528051733970642, + "learning_rate": 1.1866666666666666e-06, + "loss": 0.0004, + "num_tokens": 1900671.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 119.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00022559388889931142, + "kl": 9.864568710327148e-06, + "learning_rate": 1.1863333333333334e-06, + "loss": 0.0, + "num_tokens": 1900891.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 119.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017285572364926338, + "kl": 0.0004815608263015747, + "learning_rate": 1.186e-06, + "loss": 0.0, + "num_tokens": 1901103.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 119.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.845367670059204, + "kl": 0.18102366849780083, + "learning_rate": 1.1856666666666665e-06, + "loss": 0.0167, + "num_tokens": 1901450.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 6444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 119.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006941179162822664, + "kl": 0.0012119284365326166, + "learning_rate": 1.1853333333333335e-06, + "loss": 0.0001, + "num_tokens": 1901730.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 119.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012484087608754635, + "kl": 0.0045114741660654545, + "learning_rate": 1.185e-06, + "loss": 0.0002, + "num_tokens": 1902024.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 119.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7878062725067139, + "kl": 0.10964963585138321, + "learning_rate": 1.1846666666666667e-06, + "loss": -0.08, + "num_tokens": 1902396.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 6447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 119.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014289182610809803, + "kl": 0.0016539028147235513, + "learning_rate": 1.1843333333333334e-06, + "loss": 0.0001, + "num_tokens": 1902680.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 119.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006454470567405224, + "kl": 7.849186658859253e-05, + "learning_rate": 1.184e-06, + "loss": 0.0, + "num_tokens": 1902892.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 119.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07206703722476959, + "kl": 0.014787786640226841, + "learning_rate": 1.1836666666666666e-06, + "loss": 0.0007, + "num_tokens": 1903199.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 119.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1333763152360916, + "kl": 0.03642389178276062, + "learning_rate": 1.1833333333333334e-06, + "loss": 0.0018, + "num_tokens": 1903489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 119.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21446919441223145, + "kl": 0.0358577836304903, + "learning_rate": 1.183e-06, + "loss": 0.0017, + "num_tokens": 1903759.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 119.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0662379041314125, + "kl": 0.014518793672323227, + "learning_rate": 1.1826666666666667e-06, + "loss": 0.0007, + "num_tokens": 1904097.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 119.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020288633182644844, + "kl": 0.005442020716145635, + "learning_rate": 1.1823333333333335e-06, + "loss": 0.0003, + "num_tokens": 1904365.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 119.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059898924082517624, + "kl": 0.00407529016956687, + "learning_rate": 1.182e-06, + "loss": 0.0002, + "num_tokens": 1904661.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 119.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045951537787914276, + "kl": 0.012956413440406322, + "learning_rate": 1.1816666666666666e-06, + "loss": 0.0007, + "num_tokens": 1904989.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 119.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014931109035387635, + "kl": 0.0013553500175476074, + "learning_rate": 1.1813333333333334e-06, + "loss": 0.0001, + "num_tokens": 1905205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 119.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0833604484796524, + "kl": 0.016794190276414156, + "learning_rate": 1.181e-06, + "loss": 0.0008, + "num_tokens": 1905541.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 119.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9779343605041504, + "kl": 0.062278375029563904, + "learning_rate": 1.1806666666666666e-06, + "loss": -0.0754, + "num_tokens": 1905901.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 6459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 119.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008301954949274659, + "kl": 0.0037469416856765747, + "learning_rate": 1.1803333333333334e-06, + "loss": 0.0002, + "num_tokens": 1906137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 119.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4293761253356934, + "kl": 0.10022081807255745, + "learning_rate": 1.18e-06, + "loss": 0.1051, + "num_tokens": 1906457.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 119.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12528786063194275, + "kl": 0.017758074216544628, + "learning_rate": 1.1796666666666667e-06, + "loss": 0.001, + "num_tokens": 1906759.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 119.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03259257972240448, + "kl": 0.0041419247863814235, + "learning_rate": 1.1793333333333335e-06, + "loss": 0.0002, + "num_tokens": 1907090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 119.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032860408537089825, + "kl": 7.447600364685059e-05, + "learning_rate": 1.179e-06, + "loss": 0.0, + "num_tokens": 1907350.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 119.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0377628318965435, + "kl": 0.032129768282175064, + "learning_rate": 1.1786666666666666e-06, + "loss": 0.0016, + "num_tokens": 1907760.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 119.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022190527990460396, + "kl": 0.0008364841341972351, + "learning_rate": 1.1783333333333334e-06, + "loss": 0.0, + "num_tokens": 1908020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 119.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02222239039838314, + "kl": 0.0007166534633142874, + "learning_rate": 1.178e-06, + "loss": 0.0, + "num_tokens": 1908239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 119.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02523590251803398, + "kl": 0.001289821113459766, + "learning_rate": 1.1776666666666668e-06, + "loss": 0.0001, + "num_tokens": 1908505.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 119.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2763098478317261, + "kl": 0.023756375536322594, + "learning_rate": 1.1773333333333333e-06, + "loss": 0.0012, + "num_tokens": 1908773.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 119.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03187099099159241, + "kl": 0.0006134450304671191, + "learning_rate": 1.177e-06, + "loss": 0.0, + "num_tokens": 1909029.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 119.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.4135613441467285, + "kl": 0.18188875913619995, + "learning_rate": 1.1766666666666667e-06, + "loss": 0.2793, + "num_tokens": 1909377.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 119.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043207310140132904, + "kl": 0.00531318667344749, + "learning_rate": 1.1763333333333335e-06, + "loss": 0.0003, + "num_tokens": 1909673.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 119.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019358597695827484, + "kl": 0.00369433150626719, + "learning_rate": 1.176e-06, + "loss": 0.0002, + "num_tokens": 1909973.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 119.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052679259330034256, + "kl": 0.006142878322862089, + "learning_rate": 1.1756666666666666e-06, + "loss": 0.0003, + "num_tokens": 1910305.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 119.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05763982981443405, + "kl": 0.008953645825386047, + "learning_rate": 1.1753333333333334e-06, + "loss": 0.0004, + "num_tokens": 1910587.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 119.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.699066162109375, + "kl": 0.014039483503438532, + "learning_rate": 1.175e-06, + "loss": 0.0686, + "num_tokens": 1910918.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 119.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009389546699821949, + "kl": 0.001685982570052147, + "learning_rate": 1.1746666666666668e-06, + "loss": 0.0001, + "num_tokens": 1911230.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 119.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004241365008056164, + "kl": 0.0001102412716136314, + "learning_rate": 1.1743333333333333e-06, + "loss": 0.0, + "num_tokens": 1911502.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 119.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8897757530212402, + "kl": 0.3033977091545239, + "learning_rate": 1.174e-06, + "loss": 0.0152, + "num_tokens": 1911762.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 120.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009460690431296825, + "kl": 0.00846131145954132, + "learning_rate": 1.1736666666666667e-06, + "loss": 0.0004, + "num_tokens": 1912034.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 120.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0077101378701627254, + "kl": 0.00013370811939239502, + "learning_rate": 1.1733333333333335e-06, + "loss": 0.0, + "num_tokens": 1912246.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6481 + }, + { + "clip_ratio/high_max": 0.0069444444961845875, + "clip_ratio/high_mean": 0.0069444444961845875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0069444444961845875, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 120.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2743289470672607, + "kl": 0.1450158953666687, + "learning_rate": 1.173e-06, + "loss": -0.0709, + "num_tokens": 1912602.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 6482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06430388242006302, + "kl": 0.027581464499235153, + "learning_rate": 1.1726666666666668e-06, + "loss": 0.0014, + "num_tokens": 1912900.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03585256636142731, + "kl": 0.010957730002701283, + "learning_rate": 1.1723333333333334e-06, + "loss": 0.0006, + "num_tokens": 1913227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.0925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.4235100746154785, + "kl": 0.03472807363141328, + "learning_rate": 1.172e-06, + "loss": 0.133, + "num_tokens": 1913531.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 120.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02272755838930607, + "kl": 0.0005561560392379761, + "learning_rate": 1.1716666666666667e-06, + "loss": 0.0, + "num_tokens": 1913737.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021256390027701855, + "kl": 0.001589758088812232, + "learning_rate": 1.1713333333333333e-06, + "loss": 0.0001, + "num_tokens": 1914011.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1417020708322525, + "kl": 0.05191383324563503, + "learning_rate": 1.1709999999999999e-06, + "loss": 0.0025, + "num_tokens": 1914344.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0343189463019371, + "kl": 0.011440465692430735, + "learning_rate": 1.1706666666666669e-06, + "loss": 0.0006, + "num_tokens": 1914638.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 120.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08473880589008331, + "kl": 0.019098554272204638, + "learning_rate": 1.1703333333333335e-06, + "loss": 0.001, + "num_tokens": 1914925.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 120.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020691394805908203, + "kl": 0.0007858893950469792, + "learning_rate": 1.17e-06, + "loss": 0.0, + "num_tokens": 1915205.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 120.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04949735105037689, + "kl": 0.0351734422147274, + "learning_rate": 1.1696666666666668e-06, + "loss": 0.0018, + "num_tokens": 1915479.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 120.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01649233140051365, + "kl": 0.00044474005699157715, + "learning_rate": 1.1693333333333334e-06, + "loss": 0.0, + "num_tokens": 1915691.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 120.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.171275615692139, + "kl": 0.2138686180114746, + "learning_rate": 1.169e-06, + "loss": -0.0128, + "num_tokens": 1916052.0, + "reward": 5.625, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 2.462214469909668, + "step": 6494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 120.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.692605972290039, + "kl": 0.013836213911417872, + "learning_rate": 1.1686666666666667e-06, + "loss": 0.0013, + "num_tokens": 1916312.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 6495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 120.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031527888029813766, + "kl": 0.0036845599533990026, + "learning_rate": 1.1683333333333333e-06, + "loss": 0.0002, + "num_tokens": 1916602.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 120.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08643209934234619, + "kl": 0.04004262760281563, + "learning_rate": 1.1679999999999999e-06, + "loss": 0.0019, + "num_tokens": 1916934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 120.33333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.578334808349609, + "kl": 0.05969178630039096, + "learning_rate": 1.1676666666666669e-06, + "loss": 0.0489, + "num_tokens": 1917277.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 120.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01702847331762314, + "kl": 0.09659398719668388, + "learning_rate": 1.1673333333333334e-06, + "loss": 0.0048, + "num_tokens": 1917649.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 120.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06658611446619034, + "kl": 0.01085315365344286, + "learning_rate": 1.167e-06, + "loss": 0.0005, + "num_tokens": 1917975.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 120.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7283474206924438, + "kl": 0.05486566200852394, + "learning_rate": 1.1666666666666668e-06, + "loss": 0.1116, + "num_tokens": 1918396.0, + "reward": 2.174999952316284, + "reward_std": 1.649999976158142, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.649999976158142, + "step": 6501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 120.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00709752831608057, + "kl": 0.00205912534147501, + "learning_rate": 1.1663333333333334e-06, + "loss": 0.0001, + "num_tokens": 1918708.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 120.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026969702914357185, + "kl": 0.049299852922558784, + "learning_rate": 1.166e-06, + "loss": 0.0025, + "num_tokens": 1919041.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 120.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033657122403383255, + "kl": 0.001314066001214087, + "learning_rate": 1.1656666666666667e-06, + "loss": 0.0001, + "num_tokens": 1919276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 120.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023198192939162254, + "kl": 0.0028923161153215915, + "learning_rate": 1.1653333333333333e-06, + "loss": 0.0001, + "num_tokens": 1919578.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 120.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031800609081983566, + "kl": 0.26318879425525665, + "learning_rate": 1.1649999999999999e-06, + "loss": 0.0132, + "num_tokens": 1919882.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 120.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23323768377304077, + "kl": 0.05135101266205311, + "learning_rate": 1.1646666666666669e-06, + "loss": 0.0026, + "num_tokens": 1920236.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 120.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008731288253329694, + "kl": 3.1054019927978516e-05, + "learning_rate": 1.1643333333333334e-06, + "loss": 0.0, + "num_tokens": 1920456.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 120.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05076427012681961, + "kl": 0.002242106245830655, + "learning_rate": 1.164e-06, + "loss": 0.0001, + "num_tokens": 1920675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 120.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03270020708441734, + "kl": 0.005240541649982333, + "learning_rate": 1.1636666666666668e-06, + "loss": 0.0003, + "num_tokens": 1920967.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 120.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008113679359667003, + "kl": 0.0037462636828422546, + "learning_rate": 1.1633333333333333e-06, + "loss": 0.0002, + "num_tokens": 1921203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 120.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016736130928620696, + "kl": 0.00025835633277893066, + "learning_rate": 1.163e-06, + "loss": 0.0, + "num_tokens": 1921463.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 120.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0071660554967820644, + "kl": 0.0015529319643974304, + "learning_rate": 1.1626666666666667e-06, + "loss": 0.0001, + "num_tokens": 1921679.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 120.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06511753052473068, + "kl": 0.010602842550724745, + "learning_rate": 1.1623333333333333e-06, + "loss": 0.0005, + "num_tokens": 1921978.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 120.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029630031436681747, + "kl": 0.0008384265820495784, + "learning_rate": 1.162e-06, + "loss": 0.0, + "num_tokens": 1922221.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 120.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004593794234097004, + "kl": 0.00016801655146991834, + "learning_rate": 1.1616666666666668e-06, + "loss": 0.0, + "num_tokens": 1922481.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07340265065431595, + "kl": 0.01543547073379159, + "learning_rate": 1.1613333333333334e-06, + "loss": 0.0008, + "num_tokens": 1922753.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 120.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005003159400075674, + "kl": 0.0001290440595766995, + "learning_rate": 1.161e-06, + "loss": 0.0, + "num_tokens": 1923009.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07951806485652924, + "kl": 0.008560521760955453, + "learning_rate": 1.1606666666666668e-06, + "loss": 0.0004, + "num_tokens": 1923327.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 120.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018592054024338722, + "kl": 0.012788759544491768, + "learning_rate": 1.1603333333333333e-06, + "loss": 0.0006, + "num_tokens": 1923587.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02817356027662754, + "kl": 0.0023380888160318136, + "learning_rate": 1.16e-06, + "loss": 0.0001, + "num_tokens": 1923860.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 120.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.148828029632568, + "kl": 0.2056941445916891, + "learning_rate": 1.1596666666666667e-06, + "loss": 0.0948, + "num_tokens": 1924137.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 120.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10228823870420456, + "kl": 0.010372804943472147, + "learning_rate": 1.1593333333333333e-06, + "loss": 0.0005, + "num_tokens": 1924407.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2594919800758362, + "kl": 0.03747236914932728, + "learning_rate": 1.159e-06, + "loss": 0.0022, + "num_tokens": 1924732.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 120.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03636854514479637, + "kl": 0.00693343306920724, + "learning_rate": 1.1586666666666668e-06, + "loss": 0.0004, + "num_tokens": 1925000.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 120.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005781413055956364, + "kl": 0.0004506640980252996, + "learning_rate": 1.1583333333333334e-06, + "loss": 0.0, + "num_tokens": 1925312.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041753239929676056, + "kl": 0.15194011479616165, + "learning_rate": 1.158e-06, + "loss": 0.0075, + "num_tokens": 1925631.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 120.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4301632046699524, + "kl": 0.05248394142836332, + "learning_rate": 1.1576666666666667e-06, + "loss": 0.003, + "num_tokens": 1925913.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 120.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045214906334877014, + "kl": 0.020316094160079956, + "learning_rate": 1.1573333333333333e-06, + "loss": 0.001, + "num_tokens": 1926213.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 120.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03149750456213951, + "kl": 0.0044946682173758745, + "learning_rate": 1.157e-06, + "loss": 0.0002, + "num_tokens": 1926481.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 120.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06995546817779541, + "kl": 0.010226914193481207, + "learning_rate": 1.1566666666666667e-06, + "loss": 0.0005, + "num_tokens": 1926769.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 120.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14868246018886566, + "kl": 0.019098061602562666, + "learning_rate": 1.1563333333333332e-06, + "loss": 0.001, + "num_tokens": 1927105.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 120.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009264439344406128, + "kl": 0.0015327318105846643, + "learning_rate": 1.156e-06, + "loss": 0.0001, + "num_tokens": 1927387.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 121.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03600781783461571, + "kl": 0.00176073465263471, + "learning_rate": 1.1556666666666668e-06, + "loss": 0.0001, + "num_tokens": 1927708.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 121.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03034849837422371, + "kl": 0.0027270345017313957, + "learning_rate": 1.1553333333333334e-06, + "loss": 0.0001, + "num_tokens": 1927980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 121.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.809662818908691, + "kl": 0.01322916243225336, + "learning_rate": 1.155e-06, + "loss": 0.0459, + "num_tokens": 1928308.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 121.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033074396196752787, + "kl": 0.00017439574003219604, + "learning_rate": 1.1546666666666667e-06, + "loss": 0.0, + "num_tokens": 1928552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 121.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012906003976240754, + "kl": 0.00033399835228919983, + "learning_rate": 1.1543333333333333e-06, + "loss": 0.0, + "num_tokens": 1928812.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 121.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08773849159479141, + "kl": 0.001112423837184906, + "learning_rate": 1.154e-06, + "loss": 0.0001, + "num_tokens": 1929024.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 121.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.141241654753685, + "kl": 0.04679136909544468, + "learning_rate": 1.1536666666666667e-06, + "loss": 0.0024, + "num_tokens": 1929341.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 121.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007866517407819629, + "kl": 0.003756478428840637, + "learning_rate": 1.1533333333333332e-06, + "loss": 0.0002, + "num_tokens": 1929577.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 121.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04516836255788803, + "kl": 0.012945299968123436, + "learning_rate": 1.153e-06, + "loss": 0.0006, + "num_tokens": 1929838.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 121.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11207715421915054, + "kl": 0.058697886765003204, + "learning_rate": 1.1526666666666668e-06, + "loss": 0.0029, + "num_tokens": 1930179.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 121.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030223773792386055, + "kl": 0.2634342759847641, + "learning_rate": 1.1523333333333334e-06, + "loss": 0.0132, + "num_tokens": 1930483.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 121.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00621010409668088, + "kl": 0.00014281273070082534, + "learning_rate": 1.1520000000000002e-06, + "loss": 0.0, + "num_tokens": 1930739.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 121.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003828973858617246, + "kl": 0.001236374955624342, + "learning_rate": 1.1516666666666667e-06, + "loss": 0.0001, + "num_tokens": 1931019.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 121.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03259090706706047, + "kl": 0.00884171505458653, + "learning_rate": 1.1513333333333333e-06, + "loss": 0.0004, + "num_tokens": 1931311.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 121.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03392448276281357, + "kl": 0.0034284861758351326, + "learning_rate": 1.151e-06, + "loss": 0.0002, + "num_tokens": 1931641.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 121.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03775680437684059, + "kl": 0.03889380767941475, + "learning_rate": 1.1506666666666666e-06, + "loss": 0.0019, + "num_tokens": 1931917.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 121.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12175801396369934, + "kl": 0.010034102015197277, + "learning_rate": 1.1503333333333332e-06, + "loss": 0.0005, + "num_tokens": 1932217.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 121.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02031847834587097, + "kl": 0.006155602788567194, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0003, + "num_tokens": 1932489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 121.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024707650765776634, + "kl": 0.0016802847385406494, + "learning_rate": 1.1496666666666668e-06, + "loss": 0.0001, + "num_tokens": 1932801.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 121.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.629100799560547, + "kl": 0.19461789727210999, + "learning_rate": 1.1493333333333334e-06, + "loss": 0.0063, + "num_tokens": 1933169.0, + "reward": 6.625, + "reward_std": 2.428133726119995, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 2.428133726119995, + "step": 6553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 121.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10235032439231873, + "kl": 0.02571127749979496, + "learning_rate": 1.1490000000000001e-06, + "loss": 0.0014, + "num_tokens": 1933459.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 121.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020946571603417397, + "kl": 0.0007784941699355841, + "learning_rate": 1.1486666666666667e-06, + "loss": 0.0, + "num_tokens": 1933693.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 121.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010214767418801785, + "kl": 0.0001605344732524827, + "learning_rate": 1.1483333333333333e-06, + "loss": 0.0, + "num_tokens": 1933973.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 121.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568927109241486, + "kl": 0.012763059698045254, + "learning_rate": 1.148e-06, + "loss": 0.0006, + "num_tokens": 1934300.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 121.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029653701931238174, + "kl": 0.0013552189921028912, + "learning_rate": 1.1476666666666666e-06, + "loss": 0.0001, + "num_tokens": 1934625.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 121.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.533745527267456, + "kl": 0.026880485005676746, + "learning_rate": 1.1473333333333332e-06, + "loss": 0.1205, + "num_tokens": 1934906.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 121.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16515734791755676, + "kl": 0.02492852951399982, + "learning_rate": 1.1470000000000002e-06, + "loss": 0.0011, + "num_tokens": 1935246.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 121.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10162846744060516, + "kl": 0.010246307007037103, + "learning_rate": 1.1466666666666668e-06, + "loss": 0.0005, + "num_tokens": 1935506.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 121.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019135337322950363, + "kl": 0.0056260202545672655, + "learning_rate": 1.1463333333333333e-06, + "loss": 0.0003, + "num_tokens": 1935772.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 121.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009062511846423149, + "kl": 0.008659596554934978, + "learning_rate": 1.1460000000000001e-06, + "loss": 0.0004, + "num_tokens": 1936044.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 121.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055110394954681396, + "kl": 0.012972465250641108, + "learning_rate": 1.1456666666666667e-06, + "loss": 0.0007, + "num_tokens": 1936372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 121.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004890459589660168, + "kl": 0.00022017360606696457, + "learning_rate": 1.1453333333333333e-06, + "loss": 0.0, + "num_tokens": 1936632.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 121.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027844659984111786, + "kl": 0.004094152478501201, + "learning_rate": 1.145e-06, + "loss": 0.0002, + "num_tokens": 1936922.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 121.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12352059036493301, + "kl": 0.007494664052501321, + "learning_rate": 1.1446666666666666e-06, + "loss": 0.0004, + "num_tokens": 1937182.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 121.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015663154423236847, + "kl": 0.0023040270316414535, + "learning_rate": 1.1443333333333332e-06, + "loss": 0.0001, + "num_tokens": 1937478.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 121.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011276880279183388, + "kl": 0.15838884562253952, + "learning_rate": 1.1440000000000002e-06, + "loss": 0.0079, + "num_tokens": 1937788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 121.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022900735959410667, + "kl": 0.0010702908039093018, + "learning_rate": 1.1436666666666668e-06, + "loss": 0.0001, + "num_tokens": 1938000.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 121.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04618305340409279, + "kl": 0.002308756113052368, + "learning_rate": 1.1433333333333333e-06, + "loss": 0.0001, + "num_tokens": 1938204.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 121.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12868893146514893, + "kl": 0.032958365976810455, + "learning_rate": 1.1430000000000001e-06, + "loss": 0.0017, + "num_tokens": 1938505.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 121.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0051928190514445305, + "kl": 0.00048614738625474274, + "learning_rate": 1.1426666666666667e-06, + "loss": 0.0, + "num_tokens": 1938725.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 121.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017940184101462364, + "kl": 0.09643261134624481, + "learning_rate": 1.1423333333333333e-06, + "loss": 0.0048, + "num_tokens": 1939097.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 121.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6241710186004639, + "kl": 0.07638740912079811, + "learning_rate": 1.142e-06, + "loss": 0.0086, + "num_tokens": 1939464.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 6575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 121.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14684060215950012, + "kl": 0.014174801646731794, + "learning_rate": 1.1416666666666666e-06, + "loss": 0.0007, + "num_tokens": 1939747.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 121.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07468777149915695, + "kl": 0.019712856505066156, + "learning_rate": 1.1413333333333334e-06, + "loss": 0.001, + "num_tokens": 1940048.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 121.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10913591086864471, + "kl": 0.03890548646450043, + "learning_rate": 1.1410000000000002e-06, + "loss": 0.0019, + "num_tokens": 1940471.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 121.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.116952896118164, + "kl": 0.08014311641454697, + "learning_rate": 1.1406666666666667e-06, + "loss": 0.1064, + "num_tokens": 1940835.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 121.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06265262514352798, + "kl": 0.010103950276970863, + "learning_rate": 1.1403333333333333e-06, + "loss": 0.0005, + "num_tokens": 1941132.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 121.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.105821393430233, + "kl": 0.04428875632584095, + "learning_rate": 1.14e-06, + "loss": 0.0022, + "num_tokens": 1941484.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 121.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.788204669952393, + "kl": 0.0600991346873343, + "learning_rate": 1.1396666666666667e-06, + "loss": 0.0432, + "num_tokens": 1941785.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 121.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018679805099964142, + "kl": 0.005518015008419752, + "learning_rate": 1.1393333333333332e-06, + "loss": 0.0003, + "num_tokens": 1942053.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 121.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007013689610175788, + "kl": 2.3759901523590088e-05, + "learning_rate": 1.139e-06, + "loss": 0.0, + "num_tokens": 1942273.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 121.94444444444444, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.533872604370117, + "kl": 0.07464139349758625, + "learning_rate": 1.1386666666666666e-06, + "loss": -0.1319, + "num_tokens": 1942593.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 121.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038593973964452744, + "kl": 0.0026437936176080257, + "learning_rate": 1.1383333333333334e-06, + "loss": 0.0001, + "num_tokens": 1942899.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 121.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13153290748596191, + "kl": 0.008460809476673603, + "learning_rate": 1.1380000000000002e-06, + "loss": 0.0005, + "num_tokens": 1943126.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 122.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.673837900161743, + "kl": 0.02412175014615059, + "learning_rate": 1.1376666666666667e-06, + "loss": -0.0391, + "num_tokens": 1943392.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 122.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.269593209028244, + "kl": 0.022401707246899605, + "learning_rate": 1.1373333333333333e-06, + "loss": 0.0012, + "num_tokens": 1943654.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 122.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007820813334546983, + "kl": 0.003760233521461487, + "learning_rate": 1.137e-06, + "loss": 0.0002, + "num_tokens": 1943890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 122.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017251137644052505, + "kl": 0.0036438003880903125, + "learning_rate": 1.1366666666666667e-06, + "loss": 0.0002, + "num_tokens": 1944158.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 122.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09193898737430573, + "kl": 0.008601611480116844, + "learning_rate": 1.1363333333333334e-06, + "loss": 0.0004, + "num_tokens": 1944451.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 122.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01775575429201126, + "kl": 0.09643914923071861, + "learning_rate": 1.136e-06, + "loss": 0.0048, + "num_tokens": 1944823.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 122.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004691155627369881, + "kl": 0.0015607811510562897, + "learning_rate": 1.1356666666666666e-06, + "loss": 0.0001, + "num_tokens": 1945135.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 122.12962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6137292385101318, + "kl": 0.1320759579539299, + "learning_rate": 1.1353333333333334e-06, + "loss": -0.0575, + "num_tokens": 1945512.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 6595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 122.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005975870881229639, + "kl": 0.001468625690904446, + "learning_rate": 1.1350000000000001e-06, + "loss": 0.0001, + "num_tokens": 1945731.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 122.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037724416702985764, + "kl": 0.0019186652498319745, + "learning_rate": 1.1346666666666667e-06, + "loss": 0.0001, + "num_tokens": 1946058.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 122.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03191739693284035, + "kl": 0.000301949679851532, + "learning_rate": 1.1343333333333333e-06, + "loss": 0.0, + "num_tokens": 1946270.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.43631380796432495, + "kl": 0.026331719011068344, + "learning_rate": 1.134e-06, + "loss": 0.0017, + "num_tokens": 1946541.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 122.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019121550023555756, + "kl": 0.0008227803918998688, + "learning_rate": 1.1336666666666666e-06, + "loss": 0.0, + "num_tokens": 1946776.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 122.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03276032581925392, + "kl": 0.26299040019512177, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.0131, + "num_tokens": 1947080.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 122.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026009004563093185, + "kl": 0.0013636148360092193, + "learning_rate": 1.133e-06, + "loss": 0.0001, + "num_tokens": 1947398.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 122.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036825548857450485, + "kl": 0.005071159917861223, + "learning_rate": 1.1326666666666666e-06, + "loss": 0.0003, + "num_tokens": 1947686.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 122.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36840197443962097, + "kl": 0.05935653671622276, + "learning_rate": 1.1323333333333333e-06, + "loss": 0.003, + "num_tokens": 1947962.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 122.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017999956384301186, + "kl": 0.012935727834701538, + "learning_rate": 1.1320000000000001e-06, + "loss": 0.0006, + "num_tokens": 1948222.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 122.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01400921493768692, + "kl": 0.00029931643803138286, + "learning_rate": 1.1316666666666667e-06, + "loss": 0.0, + "num_tokens": 1948465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001294066198170185, + "kl": 4.2216479414491914e-05, + "learning_rate": 1.1313333333333335e-06, + "loss": 0.0, + "num_tokens": 1948733.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 122.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07810121774673462, + "kl": 0.020303184166550636, + "learning_rate": 1.131e-06, + "loss": 0.0011, + "num_tokens": 1949017.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 122.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1420530080795288, + "kl": 0.03908430226147175, + "learning_rate": 1.1306666666666666e-06, + "loss": 0.002, + "num_tokens": 1949322.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 122.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29549914598464966, + "kl": 0.027963336557149887, + "learning_rate": 1.1303333333333334e-06, + "loss": 0.0014, + "num_tokens": 1949588.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 122.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02412734553217888, + "kl": 0.001976420055143535, + "learning_rate": 1.13e-06, + "loss": 0.0001, + "num_tokens": 1949868.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 122.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000716892653144896, + "kl": 2.3633241653442383e-05, + "learning_rate": 1.1296666666666666e-06, + "loss": 0.0, + "num_tokens": 1950088.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 122.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17835897207260132, + "kl": 0.023855048464611173, + "learning_rate": 1.1293333333333333e-06, + "loss": 0.0013, + "num_tokens": 1950428.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 122.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06581725180149078, + "kl": 0.011161921545863152, + "learning_rate": 1.1290000000000001e-06, + "loss": 0.0006, + "num_tokens": 1950757.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 122.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9422038793563843, + "kl": 0.19157817773520947, + "learning_rate": 1.1286666666666667e-06, + "loss": 0.0095, + "num_tokens": 1951112.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 122.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.47744083404541, + "kl": 0.029025439638644457, + "learning_rate": 1.1283333333333335e-06, + "loss": 0.0026, + "num_tokens": 1951441.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6616 + }, + { + "clip_ratio/high_max": 0.009803921915590763, + "clip_ratio/high_mean": 0.009803921915590763, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009803921915590763, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 122.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.328023910522461, + "kl": 0.16562017053365707, + "learning_rate": 1.128e-06, + "loss": 0.1351, + "num_tokens": 1951792.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 122.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016530541703104973, + "kl": 0.15855525434017181, + "learning_rate": 1.1276666666666666e-06, + "loss": 0.0079, + "num_tokens": 1952102.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 122.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03119640238583088, + "kl": 0.00044555962085723877, + "learning_rate": 1.1273333333333334e-06, + "loss": 0.0, + "num_tokens": 1952310.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 122.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07498772442340851, + "kl": 0.026659665629267693, + "learning_rate": 1.127e-06, + "loss": 0.0013, + "num_tokens": 1952614.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 122.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8686230182647705, + "kl": 0.007708510383963585, + "learning_rate": 1.1266666666666665e-06, + "loss": 0.0326, + "num_tokens": 1952955.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 6621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 122.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06849901378154755, + "kl": 0.011428920784965158, + "learning_rate": 1.1263333333333335e-06, + "loss": 0.0005, + "num_tokens": 1953233.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 122.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05124097689986229, + "kl": 0.004757784656248987, + "learning_rate": 1.126e-06, + "loss": 0.0002, + "num_tokens": 1953533.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.247563362121582, + "kl": 0.015773415565490723, + "learning_rate": 1.1256666666666667e-06, + "loss": -0.0334, + "num_tokens": 1953804.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 122.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037757158279418945, + "kl": 0.000987064908258617, + "learning_rate": 1.1253333333333335e-06, + "loss": 0.0, + "num_tokens": 1954102.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 122.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006865231785923243, + "kl": 0.0014312155544757843, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "num_tokens": 1954379.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 122.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05403284728527069, + "kl": 0.04023777320981026, + "learning_rate": 1.1246666666666666e-06, + "loss": 0.002, + "num_tokens": 1954783.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 122.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3230810165405273, + "kl": 0.04933694563806057, + "learning_rate": 1.1243333333333334e-06, + "loss": 0.1275, + "num_tokens": 1955164.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 122.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016358578577637672, + "kl": 0.00044018030166625977, + "learning_rate": 1.124e-06, + "loss": 0.0, + "num_tokens": 1955376.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 122.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040608134120702744, + "kl": 0.013359877280890942, + "learning_rate": 1.1236666666666665e-06, + "loss": 0.0007, + "num_tokens": 1955679.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 122.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0580645315349102, + "kl": 0.005300495307892561, + "learning_rate": 1.1233333333333335e-06, + "loss": 0.0004, + "num_tokens": 1955917.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 122.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07330907881259918, + "kl": 0.019006874412298203, + "learning_rate": 1.123e-06, + "loss": 0.001, + "num_tokens": 1956261.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 122.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07663213461637497, + "kl": 0.020455674966797233, + "learning_rate": 1.1226666666666667e-06, + "loss": 0.0011, + "num_tokens": 1956548.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0524519719183445, + "kl": 0.0009693175088614225, + "learning_rate": 1.1223333333333334e-06, + "loss": 0.0, + "num_tokens": 1956804.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 122.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020544296130537987, + "kl": 0.0055718638468533754, + "learning_rate": 1.122e-06, + "loss": 0.0003, + "num_tokens": 1957072.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 122.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07634744793176651, + "kl": 0.009925136109814048, + "learning_rate": 1.1216666666666666e-06, + "loss": 0.0005, + "num_tokens": 1957372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004817161243408918, + "kl": 0.00021977425058139488, + "learning_rate": 1.1213333333333334e-06, + "loss": 0.0, + "num_tokens": 1957632.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 122.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011919735930860043, + "kl": 0.0002291479249834083, + "learning_rate": 1.121e-06, + "loss": 0.0, + "num_tokens": 1957912.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 122.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20443905889987946, + "kl": 0.07632733508944511, + "learning_rate": 1.1206666666666667e-06, + "loss": 0.004, + "num_tokens": 1958272.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 122.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025047261267900467, + "kl": 0.0018797516240738332, + "learning_rate": 1.1203333333333335e-06, + "loss": 0.0001, + "num_tokens": 1958532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 122.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06778904050588608, + "kl": 0.009510488947853446, + "learning_rate": 1.12e-06, + "loss": 0.0005, + "num_tokens": 1958858.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 123.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.01747989654541, + "kl": 0.01983852032572031, + "learning_rate": 1.1196666666666666e-06, + "loss": 0.1248, + "num_tokens": 1959173.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 123.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05625295639038086, + "kl": 0.05622301809489727, + "learning_rate": 1.1193333333333334e-06, + "loss": 0.0028, + "num_tokens": 1959509.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 123.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021355150267481804, + "kl": 0.0058467877097427845, + "learning_rate": 1.119e-06, + "loss": 0.0003, + "num_tokens": 1959777.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02639990858733654, + "kl": 0.0004590049502439797, + "learning_rate": 1.1186666666666666e-06, + "loss": 0.0, + "num_tokens": 1959990.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 123.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14955127239227295, + "kl": 0.03589828871190548, + "learning_rate": 1.1183333333333334e-06, + "loss": 0.0018, + "num_tokens": 1960314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007937393384054303, + "kl": 0.0037530064582824707, + "learning_rate": 1.118e-06, + "loss": 0.0002, + "num_tokens": 1960550.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 123.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07580048590898514, + "kl": 0.01687697321176529, + "learning_rate": 1.1176666666666667e-06, + "loss": 0.0009, + "num_tokens": 1960834.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 123.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016741666477173567, + "kl": 0.00028426945209503174, + "learning_rate": 1.1173333333333335e-06, + "loss": 0.0, + "num_tokens": 1961148.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 123.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021388089284300804, + "kl": 0.0009245864348486066, + "learning_rate": 1.117e-06, + "loss": 0.0, + "num_tokens": 1961468.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 123.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14308567345142365, + "kl": 0.04327143356204033, + "learning_rate": 1.1166666666666666e-06, + "loss": 0.0021, + "num_tokens": 1961821.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 123.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09517300128936768, + "kl": 0.011151891900226474, + "learning_rate": 1.1163333333333334e-06, + "loss": 0.0006, + "num_tokens": 1962152.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 123.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.092129185795784, + "kl": 0.01014585793018341, + "learning_rate": 1.116e-06, + "loss": 0.0005, + "num_tokens": 1962434.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 123.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0957694053649902, + "kl": 0.047333190217614174, + "learning_rate": 1.1156666666666668e-06, + "loss": -0.0042, + "num_tokens": 1962771.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 123.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06209235265851021, + "kl": 0.004939502105116844, + "learning_rate": 1.1153333333333333e-06, + "loss": 0.0003, + "num_tokens": 1963033.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 123.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030932197347283363, + "kl": 0.0024331025779247284, + "learning_rate": 1.115e-06, + "loss": 0.0001, + "num_tokens": 1963345.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 123.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29408520460128784, + "kl": 0.062339795753359795, + "learning_rate": 1.1146666666666667e-06, + "loss": 0.004, + "num_tokens": 1963723.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 123.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16003206372261047, + "kl": 0.011077792500145733, + "learning_rate": 1.1143333333333335e-06, + "loss": 0.0006, + "num_tokens": 1963997.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 123.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027227364480495453, + "kl": 0.047547515481710434, + "learning_rate": 1.114e-06, + "loss": 0.0024, + "num_tokens": 1964401.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06734632700681686, + "kl": 0.002562224864959717, + "learning_rate": 1.1136666666666666e-06, + "loss": 0.0001, + "num_tokens": 1964615.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 123.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013867117464542389, + "kl": 0.0001319587172474712, + "learning_rate": 1.1133333333333334e-06, + "loss": 0.0, + "num_tokens": 1964871.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 123.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03693933039903641, + "kl": 0.12788153439760208, + "learning_rate": 1.113e-06, + "loss": 0.0064, + "num_tokens": 1965180.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 123.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12861475348472595, + "kl": 0.010458302684128284, + "learning_rate": 1.1126666666666668e-06, + "loss": 0.0005, + "num_tokens": 1965446.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 123.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.110867977142334, + "kl": 0.12117099389433861, + "learning_rate": 1.1123333333333333e-06, + "loss": 0.0026, + "num_tokens": 1965815.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 123.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3038673400878906, + "kl": 0.3133438229560852, + "learning_rate": 1.112e-06, + "loss": 0.03, + "num_tokens": 1966120.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 123.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029912354424595833, + "kl": 0.0029180452693253756, + "learning_rate": 1.1116666666666667e-06, + "loss": 0.0001, + "num_tokens": 1966400.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 123.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00737079419195652, + "kl": 9.790012700250372e-05, + "learning_rate": 1.1113333333333335e-06, + "loss": 0.0, + "num_tokens": 1966670.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 123.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013049686327576637, + "kl": 0.0009212995209963992, + "learning_rate": 1.111e-06, + "loss": 0.0, + "num_tokens": 1966966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 123.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08290422707796097, + "kl": 0.007014411268755794, + "learning_rate": 1.1106666666666668e-06, + "loss": 0.0004, + "num_tokens": 1967256.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 38.5, + "completions/mean_terminated_length": 38.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 123.51851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3177599906921387, + "kl": 0.07530639320611954, + "learning_rate": 1.1103333333333334e-06, + "loss": 0.0307, + "num_tokens": 1967638.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 123.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012634089216589928, + "kl": 0.002322533298865892, + "learning_rate": 1.11e-06, + "loss": 0.0001, + "num_tokens": 1967904.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 123.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08547884225845337, + "kl": 0.03303547203540802, + "learning_rate": 1.1096666666666667e-06, + "loss": 0.0016, + "num_tokens": 1968222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6672 + }, + { + "clip_ratio/high_max": 0.013513513840734959, + "clip_ratio/high_mean": 0.013513513840734959, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 123.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.624612331390381, + "kl": 0.03873021062463522, + "learning_rate": 1.1093333333333333e-06, + "loss": -0.0982, + "num_tokens": 1968537.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 123.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04499563202261925, + "kl": 0.005090321647003293, + "learning_rate": 1.1089999999999999e-06, + "loss": 0.0003, + "num_tokens": 1968869.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 123.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12638144195079803, + "kl": 0.035763099789619446, + "learning_rate": 1.1086666666666667e-06, + "loss": 0.0019, + "num_tokens": 1969158.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 123.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0042531960643827915, + "kl": 0.0002468675374984741, + "learning_rate": 1.1083333333333335e-06, + "loss": 0.0, + "num_tokens": 1969418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 123.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0449579656124115, + "kl": 0.0025136874173767865, + "learning_rate": 1.108e-06, + "loss": 0.0001, + "num_tokens": 1969718.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 123.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037679605185985565, + "kl": 0.002508559846319258, + "learning_rate": 1.1076666666666668e-06, + "loss": 0.0001, + "num_tokens": 1969978.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 123.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029916069470345974, + "kl": 0.00013575702905654907, + "learning_rate": 1.1073333333333334e-06, + "loss": 0.0, + "num_tokens": 1970222.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 123.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02202909253537655, + "kl": 0.0021343620028346777, + "learning_rate": 1.107e-06, + "loss": 0.0001, + "num_tokens": 1970550.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 123.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017070196568965912, + "kl": 0.013190580997616053, + "learning_rate": 1.1066666666666667e-06, + "loss": 0.0007, + "num_tokens": 1970810.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 123.74074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.008839130401611, + "kl": 0.040121917612850666, + "learning_rate": 1.1063333333333333e-06, + "loss": 0.2553, + "num_tokens": 1971154.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6682 + }, + { + "clip_ratio/high_max": 0.006493506487458944, + "clip_ratio/high_mean": 0.006493506487458944, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006493506487458944, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 123.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4907705783843994, + "kl": 0.07521631568670273, + "learning_rate": 1.1059999999999999e-06, + "loss": 0.0749, + "num_tokens": 1971524.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 123.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01797998696565628, + "kl": 0.0024140363093465567, + "learning_rate": 1.1056666666666669e-06, + "loss": 0.0001, + "num_tokens": 1971806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 123.79629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.226508140563965, + "kl": 0.07636481896042824, + "learning_rate": 1.1053333333333334e-06, + "loss": 0.0038, + "num_tokens": 1972109.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 123.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03057272918522358, + "kl": 0.0019758939743041992, + "learning_rate": 1.105e-06, + "loss": 0.0001, + "num_tokens": 1972381.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 123.83333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.118617057800293, + "kl": 0.06876134339836426, + "learning_rate": 1.1046666666666668e-06, + "loss": -0.2826, + "num_tokens": 1972612.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 6687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 123.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.537738800048828, + "kl": 0.03233519662171602, + "learning_rate": 1.1043333333333334e-06, + "loss": 0.0307, + "num_tokens": 1972886.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 123.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013402629643678665, + "kl": 0.00030046701431274414, + "learning_rate": 1.104e-06, + "loss": 0.0, + "num_tokens": 1973098.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 123.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018313413485884666, + "kl": 0.09641879796981812, + "learning_rate": 1.1036666666666667e-06, + "loss": 0.0048, + "num_tokens": 1973470.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 123.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027211032807826996, + "kl": 0.006547056371346116, + "learning_rate": 1.1033333333333333e-06, + "loss": 0.0003, + "num_tokens": 1973760.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0076531157828867435, + "kl": 0.0016411244869232178, + "learning_rate": 1.1029999999999999e-06, + "loss": 0.0001, + "num_tokens": 1973976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11016985774040222, + "kl": 0.0048295200685970485, + "learning_rate": 1.1026666666666669e-06, + "loss": 0.0002, + "num_tokens": 1974195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 123.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009899057913571596, + "kl": 3.697723150253296e-05, + "learning_rate": 1.1023333333333334e-06, + "loss": 0.0, + "num_tokens": 1974415.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 123.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3061322569847107, + "kl": 0.05900104157626629, + "learning_rate": 1.102e-06, + "loss": 0.0029, + "num_tokens": 1974708.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 124.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009826003573834896, + "kl": 0.0037278781237546355, + "learning_rate": 1.1016666666666668e-06, + "loss": 0.0002, + "num_tokens": 1974968.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 124.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.27171516418457, + "kl": 0.16502747312188148, + "learning_rate": 1.1013333333333333e-06, + "loss": 0.0402, + "num_tokens": 1975297.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 124.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0068075633607804775, + "kl": 0.0001447594549972564, + "learning_rate": 1.101e-06, + "loss": 0.0, + "num_tokens": 1975540.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 124.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11371287703514099, + "kl": 0.016994278877973557, + "learning_rate": 1.1006666666666667e-06, + "loss": 0.0008, + "num_tokens": 1975833.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 124.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03819449245929718, + "kl": 0.04529155418276787, + "learning_rate": 1.1003333333333333e-06, + "loss": 0.0023, + "num_tokens": 1976237.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 124.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007268072804436088, + "kl": 0.0037770047783851624, + "learning_rate": 1.0999999999999998e-06, + "loss": 0.0002, + "num_tokens": 1976473.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 124.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.014118671417236, + "kl": 0.02089185267686844, + "learning_rate": 1.0996666666666668e-06, + "loss": 0.1583, + "num_tokens": 1976757.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 6702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 124.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.267642617225647, + "kl": 0.024595767725259066, + "learning_rate": 1.0993333333333334e-06, + "loss": 0.0014, + "num_tokens": 1977095.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 124.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4866061210632324, + "kl": 0.014712004223838449, + "learning_rate": 1.099e-06, + "loss": 0.028, + "num_tokens": 1977413.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 124.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09291088581085205, + "kl": 0.05437632463872433, + "learning_rate": 1.0986666666666668e-06, + "loss": 0.0027, + "num_tokens": 1977752.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 124.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019979797303676605, + "kl": 0.006237521922230371, + "learning_rate": 1.0983333333333333e-06, + "loss": 0.0003, + "num_tokens": 1978024.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 124.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01857931725680828, + "kl": 0.0018653283914318308, + "learning_rate": 1.098e-06, + "loss": 0.0001, + "num_tokens": 1978294.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20679014921188354, + "kl": 0.018989129282999784, + "learning_rate": 1.0976666666666667e-06, + "loss": 0.001, + "num_tokens": 1978624.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 124.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00860736332833767, + "kl": 0.26743079721927643, + "learning_rate": 1.0973333333333333e-06, + "loss": 0.0134, + "num_tokens": 1978928.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6709 + }, + { + "clip_ratio/high_max": 0.017241379246115685, + "clip_ratio/high_mean": 0.017241379246115685, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017241379246115685, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3299930095672607, + "kl": 0.037656157510355115, + "learning_rate": 1.097e-06, + "loss": -0.0178, + "num_tokens": 1979229.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 6710 + }, + { + "clip_ratio/high_max": 0.011904762126505375, + "clip_ratio/high_mean": 0.011904762126505375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011904762126505375, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 124.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.283064842224121, + "kl": 0.055986179038882256, + "learning_rate": 1.0966666666666668e-06, + "loss": 0.0638, + "num_tokens": 1979538.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 124.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14586737751960754, + "kl": 0.022765167523175478, + "learning_rate": 1.0963333333333334e-06, + "loss": 0.0012, + "num_tokens": 1979859.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 124.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03695819154381752, + "kl": 0.010335276369005442, + "learning_rate": 1.096e-06, + "loss": 0.0005, + "num_tokens": 1980153.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 124.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035331230610609055, + "kl": 0.0004420280456542969, + "learning_rate": 1.0956666666666668e-06, + "loss": 0.0, + "num_tokens": 1980409.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 124.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007162205874919891, + "kl": 2.3193657398223877e-05, + "learning_rate": 1.0953333333333333e-06, + "loss": 0.0, + "num_tokens": 1980629.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 124.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3293073773384094, + "kl": 0.04378224955871701, + "learning_rate": 1.095e-06, + "loss": 0.0023, + "num_tokens": 1980909.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 124.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03382283076643944, + "kl": 0.013266139198094606, + "learning_rate": 1.0946666666666667e-06, + "loss": 0.0007, + "num_tokens": 1981183.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 124.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6197506785392761, + "kl": 0.0986871924251318, + "learning_rate": 1.0943333333333332e-06, + "loss": 0.005, + "num_tokens": 1981552.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 124.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4851492643356323, + "kl": 0.42122837249189615, + "learning_rate": 1.094e-06, + "loss": 0.0094, + "num_tokens": 1981881.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 124.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05468791350722313, + "kl": 0.005747489631175995, + "learning_rate": 1.0936666666666668e-06, + "loss": 0.0003, + "num_tokens": 1982097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 124.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01509421318769455, + "kl": 0.16073846071958542, + "learning_rate": 1.0933333333333334e-06, + "loss": 0.008, + "num_tokens": 1982406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 124.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0060296026058495045, + "kl": 0.00043669344449881464, + "learning_rate": 1.093e-06, + "loss": 0.0, + "num_tokens": 1982666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 124.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19843657314777374, + "kl": 0.011688372935168445, + "learning_rate": 1.0926666666666667e-06, + "loss": 0.0006, + "num_tokens": 1982899.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 124.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12993422150611877, + "kl": 0.0359414704144001, + "learning_rate": 1.0923333333333333e-06, + "loss": 0.0018, + "num_tokens": 1983269.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 124.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013794119469821453, + "kl": 0.001781273982487619, + "learning_rate": 1.092e-06, + "loss": 0.0001, + "num_tokens": 1983551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 124.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15650252997875214, + "kl": 0.01949635287746787, + "learning_rate": 1.0916666666666667e-06, + "loss": 0.0012, + "num_tokens": 1983819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 124.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02222519740462303, + "kl": 0.0018115078564733267, + "learning_rate": 1.0913333333333332e-06, + "loss": 0.0001, + "num_tokens": 1984097.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030690694227814674, + "kl": 0.002545369789004326, + "learning_rate": 1.091e-06, + "loss": 0.0001, + "num_tokens": 1984379.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 124.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.276077747344971, + "kl": 0.02794385515153408, + "learning_rate": 1.0906666666666668e-06, + "loss": 0.3721, + "num_tokens": 1984650.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 6729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 124.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01635040156543255, + "kl": 0.0005342587828636169, + "learning_rate": 1.0903333333333334e-06, + "loss": 0.0, + "num_tokens": 1984860.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 124.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04845886304974556, + "kl": 0.011965001933276653, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.0006, + "num_tokens": 1985166.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 124.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03365279734134674, + "kl": 0.0017847068229457363, + "learning_rate": 1.0896666666666667e-06, + "loss": 0.0001, + "num_tokens": 1985472.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03229230269789696, + "kl": 0.005544868065044284, + "learning_rate": 1.0893333333333333e-06, + "loss": 0.0003, + "num_tokens": 1985763.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 124.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1267152577638626, + "kl": 0.009396258043125272, + "learning_rate": 1.089e-06, + "loss": 0.0005, + "num_tokens": 1986090.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09310396760702133, + "kl": 0.034787725657224655, + "learning_rate": 1.0886666666666666e-06, + "loss": 0.0017, + "num_tokens": 1986362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 124.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02764524333178997, + "kl": 0.0009856869583018124, + "learning_rate": 1.0883333333333332e-06, + "loss": 0.0001, + "num_tokens": 1986578.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 124.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1168983206152916, + "kl": 0.018185105174779892, + "learning_rate": 1.088e-06, + "loss": 0.001, + "num_tokens": 1986864.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0319888591766357, + "kl": 0.03988751722499728, + "learning_rate": 1.0876666666666668e-06, + "loss": 0.038, + "num_tokens": 1987134.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 124.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035117071121931076, + "kl": 0.0029737174045294523, + "learning_rate": 1.0873333333333334e-06, + "loss": 0.0002, + "num_tokens": 1987406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 124.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08694670349359512, + "kl": 0.032046109437942505, + "learning_rate": 1.0870000000000001e-06, + "loss": 0.0015, + "num_tokens": 1987761.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 124.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07139554619789124, + "kl": 0.0026694713160395622, + "learning_rate": 1.0866666666666667e-06, + "loss": 0.0001, + "num_tokens": 1988057.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 124.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3275354504585266, + "kl": 0.29027828946709633, + "learning_rate": 1.0863333333333333e-06, + "loss": -0.0073, + "num_tokens": 1988427.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 6742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.5, + "completions/mean_terminated_length": 38.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 124.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35377341508865356, + "kl": 0.11048712581396103, + "learning_rate": 1.086e-06, + "loss": 0.0055, + "num_tokens": 1988809.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 124.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03870866447687149, + "kl": 0.003643101081252098, + "learning_rate": 1.0856666666666666e-06, + "loss": 0.0002, + "num_tokens": 1989101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 124.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02676636353135109, + "kl": 0.00027589499950408936, + "learning_rate": 1.0853333333333332e-06, + "loss": 0.0, + "num_tokens": 1989313.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 124.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00675484212115407, + "kl": 0.0009605585946701467, + "learning_rate": 1.0850000000000002e-06, + "loss": 0.0, + "num_tokens": 1989533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 124.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035259027034044266, + "kl": 0.0031496757874265313, + "learning_rate": 1.0846666666666668e-06, + "loss": 0.0002, + "num_tokens": 1989847.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 124.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014846750535070896, + "kl": 0.013692875858396292, + "learning_rate": 1.0843333333333333e-06, + "loss": 0.0007, + "num_tokens": 1990107.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 124.98148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.411559104919434, + "kl": 0.199774457141757, + "learning_rate": 1.0840000000000001e-06, + "loss": 0.0114, + "num_tokens": 1990409.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 6749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 125.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10730460286140442, + "kl": 0.012488137930631638, + "learning_rate": 1.0836666666666667e-06, + "loss": 0.0006, + "num_tokens": 1990669.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 125.01851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.871403217315674, + "kl": 0.3770003356039524, + "learning_rate": 1.0833333333333333e-06, + "loss": -0.124, + "num_tokens": 1990992.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 6751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 125.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022102462127804756, + "kl": 0.0010592732578516006, + "learning_rate": 1.083e-06, + "loss": 0.0001, + "num_tokens": 1991304.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 125.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19147978723049164, + "kl": 0.01298466557636857, + "learning_rate": 1.0826666666666666e-06, + "loss": 0.0006, + "num_tokens": 1991558.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 125.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05072171613574028, + "kl": 0.01280858926475048, + "learning_rate": 1.0823333333333332e-06, + "loss": 0.0006, + "num_tokens": 1991883.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 125.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022947359830141068, + "kl": 0.001926311815623194, + "learning_rate": 1.0820000000000002e-06, + "loss": 0.0001, + "num_tokens": 1992143.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 125.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04130426421761513, + "kl": 0.008857755921781063, + "learning_rate": 1.0816666666666668e-06, + "loss": 0.0004, + "num_tokens": 1992409.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 125.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03974505886435509, + "kl": 0.004520110785961151, + "learning_rate": 1.0813333333333333e-06, + "loss": 0.0002, + "num_tokens": 1992625.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 125.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011156612075865269, + "kl": 0.00018146783258998767, + "learning_rate": 1.0810000000000001e-06, + "loss": 0.0, + "num_tokens": 1992895.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 125.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41733357310295105, + "kl": 0.12040858715772629, + "learning_rate": 1.0806666666666667e-06, + "loss": 0.0057, + "num_tokens": 1993245.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 125.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08265991508960724, + "kl": 0.012211547465994954, + "learning_rate": 1.0803333333333333e-06, + "loss": 0.0006, + "num_tokens": 1993578.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 125.20370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.374009609222412, + "kl": 0.06599153392016888, + "learning_rate": 1.08e-06, + "loss": -0.0183, + "num_tokens": 1993936.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 125.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020630493760108948, + "kl": 0.006491988431662321, + "learning_rate": 1.0796666666666666e-06, + "loss": 0.0003, + "num_tokens": 1994234.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 125.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058466386049985886, + "kl": 0.01436679670587182, + "learning_rate": 1.0793333333333332e-06, + "loss": 0.0008, + "num_tokens": 1994508.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 125.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022439507767558098, + "kl": 0.03939470276236534, + "learning_rate": 1.0790000000000002e-06, + "loss": 0.002, + "num_tokens": 1994913.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 125.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05073806270956993, + "kl": 0.028877712786197662, + "learning_rate": 1.0786666666666667e-06, + "loss": 0.0014, + "num_tokens": 1995268.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 125.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0413748063147068, + "kl": 0.000565357506275177, + "learning_rate": 1.0783333333333333e-06, + "loss": 0.0, + "num_tokens": 1995481.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 125.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03212542086839676, + "kl": 0.0072274720296263695, + "learning_rate": 1.078e-06, + "loss": 0.0004, + "num_tokens": 1995818.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 125.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07369399815797806, + "kl": 0.0012552738189697266, + "learning_rate": 1.0776666666666667e-06, + "loss": 0.0001, + "num_tokens": 1996038.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 125.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18399615585803986, + "kl": 0.17982543259859085, + "learning_rate": 1.0773333333333332e-06, + "loss": 0.009, + "num_tokens": 1996347.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 125.37037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025085950270295143, + "kl": 0.004822854418307543, + "learning_rate": 1.077e-06, + "loss": 0.0002, + "num_tokens": 1996638.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 125.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9007527828216553, + "kl": 0.07894202030729502, + "learning_rate": 1.0766666666666666e-06, + "loss": 0.0383, + "num_tokens": 1996924.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 125.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08631592243909836, + "kl": 0.03764430247247219, + "learning_rate": 1.0763333333333334e-06, + "loss": 0.0019, + "num_tokens": 1997222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 125.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006870661396533251, + "kl": 0.0009814202785491943, + "learning_rate": 1.0760000000000002e-06, + "loss": 0.0, + "num_tokens": 1997442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 125.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06275569647550583, + "kl": 0.043732261285185814, + "learning_rate": 1.0756666666666667e-06, + "loss": 0.0022, + "num_tokens": 1997746.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 125.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041031163185834885, + "kl": 0.0018593408167362213, + "learning_rate": 1.0753333333333333e-06, + "loss": 0.0001, + "num_tokens": 1998006.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 125.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04532339423894882, + "kl": 0.009565880056470633, + "learning_rate": 1.075e-06, + "loss": 0.0004, + "num_tokens": 1998325.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 125.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11307155340909958, + "kl": 0.01451186928898096, + "learning_rate": 1.0746666666666667e-06, + "loss": 0.0007, + "num_tokens": 1998631.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 125.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016273561865091324, + "kl": 0.00021418631877168082, + "learning_rate": 1.0743333333333334e-06, + "loss": 0.0, + "num_tokens": 1998887.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 125.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008092403295449913, + "kl": 2.7239322662353516e-05, + "learning_rate": 1.074e-06, + "loss": 0.0, + "num_tokens": 1999107.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 125.55555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03855562210083, + "kl": 0.26775551214814186, + "learning_rate": 1.0736666666666666e-06, + "loss": -0.1546, + "num_tokens": 1999451.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 6780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 125.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007580664823763072, + "kl": 0.003774188458919525, + "learning_rate": 1.0733333333333334e-06, + "loss": 0.0002, + "num_tokens": 1999687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 125.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4763793349266052, + "kl": 0.058355243410915136, + "learning_rate": 1.0730000000000001e-06, + "loss": 0.0032, + "num_tokens": 1999988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 125.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1053260862827301, + "kl": 0.0136321063619107, + "learning_rate": 1.0726666666666667e-06, + "loss": 0.0007, + "num_tokens": 2000261.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 125.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04106324538588524, + "kl": 0.0023971308255568147, + "learning_rate": 1.0723333333333333e-06, + "loss": 0.0001, + "num_tokens": 2000557.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 125.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018377000465989113, + "kl": 0.006600759224966168, + "learning_rate": 1.072e-06, + "loss": 0.0003, + "num_tokens": 2000839.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 125.66666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07627657055854797, + "kl": 0.009154963190667331, + "learning_rate": 1.0716666666666666e-06, + "loss": 0.0004, + "num_tokens": 2001157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 125.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007892654277384281, + "kl": 0.2675549238920212, + "learning_rate": 1.0713333333333334e-06, + "loss": 0.0134, + "num_tokens": 2001461.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 125.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027956257108598948, + "kl": 0.00029599841218441725, + "learning_rate": 1.071e-06, + "loss": 0.0, + "num_tokens": 2001723.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 125.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01695258729159832, + "kl": 0.0005003288388252258, + "learning_rate": 1.0706666666666666e-06, + "loss": 0.0, + "num_tokens": 2001933.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 125.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05122031271457672, + "kl": 0.01109203090891242, + "learning_rate": 1.0703333333333333e-06, + "loss": 0.0005, + "num_tokens": 2002264.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 125.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055741142481565475, + "kl": 0.004283519694581628, + "learning_rate": 1.0700000000000001e-06, + "loss": 0.0002, + "num_tokens": 2002555.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 125.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.691385269165039, + "kl": 0.03416193334851414, + "learning_rate": 1.0696666666666667e-06, + "loss": 0.0071, + "num_tokens": 2002804.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 125.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047999780625104904, + "kl": 0.005202792584896088, + "learning_rate": 1.0693333333333335e-06, + "loss": 0.0003, + "num_tokens": 2003074.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 125.81481481481481, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3485212326049805, + "kl": 0.0012153839052189142, + "learning_rate": 1.069e-06, + "loss": -0.0263, + "num_tokens": 2003388.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 125.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16370108723640442, + "kl": 0.0857635922729969, + "learning_rate": 1.0686666666666666e-06, + "loss": 0.0043, + "num_tokens": 2003767.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 125.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07150498032569885, + "kl": 0.0024751177115831524, + "learning_rate": 1.0683333333333334e-06, + "loss": 0.0001, + "num_tokens": 2004041.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 125.87037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3654537200927734, + "kl": 0.02511216001585126, + "learning_rate": 1.068e-06, + "loss": 0.1187, + "num_tokens": 2004374.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 125.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08819403499364853, + "kl": 0.011112306732684374, + "learning_rate": 1.0676666666666666e-06, + "loss": 0.0006, + "num_tokens": 2004648.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 125.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13529539108276367, + "kl": 0.1144271045923233, + "learning_rate": 1.0673333333333333e-06, + "loss": 0.0057, + "num_tokens": 2005020.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 125.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.918779373168945, + "kl": 0.2785526819061488, + "learning_rate": 1.0670000000000001e-06, + "loss": -0.1962, + "num_tokens": 2005251.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 6800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 125.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016369061544537544, + "kl": 0.01342594949528575, + "learning_rate": 1.0666666666666667e-06, + "loss": 0.0007, + "num_tokens": 2005511.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 125.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08869292587041855, + "kl": 0.028977181762456894, + "learning_rate": 1.0663333333333335e-06, + "loss": 0.0016, + "num_tokens": 2005800.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 125.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058695025742053986, + "kl": 0.025796583853662014, + "learning_rate": 1.066e-06, + "loss": 0.0013, + "num_tokens": 2006074.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017242413014173508, + "kl": 0.0017141809221357107, + "learning_rate": 1.0656666666666666e-06, + "loss": 0.0001, + "num_tokens": 2006356.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 126.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004666907712817192, + "kl": 0.00030143558979034424, + "learning_rate": 1.0653333333333334e-06, + "loss": 0.0, + "num_tokens": 2006668.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 126.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00892165768891573, + "kl": 0.008973983582109213, + "learning_rate": 1.065e-06, + "loss": 0.0004, + "num_tokens": 2006940.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.05555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01810777559876442, + "kl": 0.001970936224097386, + "learning_rate": 1.0646666666666665e-06, + "loss": 0.0001, + "num_tokens": 2007210.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027897275984287262, + "kl": 0.0008604814356658608, + "learning_rate": 1.0643333333333335e-06, + "loss": 0.0, + "num_tokens": 2007532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 126.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013669824693351984, + "kl": 5.65648078918457e-05, + "learning_rate": 1.064e-06, + "loss": 0.0, + "num_tokens": 2007752.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 126.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03997933119535446, + "kl": 0.0038047805428504944, + "learning_rate": 1.0636666666666667e-06, + "loss": 0.0002, + "num_tokens": 2007968.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007181710097938776, + "kl": 0.0014414922916330397, + "learning_rate": 1.0633333333333335e-06, + "loss": 0.0001, + "num_tokens": 2008245.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 126.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8784403800964355, + "kl": 0.16962391138076782, + "learning_rate": 1.063e-06, + "loss": 0.022, + "num_tokens": 2008555.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 126.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008212026208639145, + "kl": 0.0004836122097913176, + "learning_rate": 1.0626666666666666e-06, + "loss": 0.0, + "num_tokens": 2008771.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 126.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07865607738494873, + "kl": 0.013101758435368538, + "learning_rate": 1.0623333333333334e-06, + "loss": 0.0007, + "num_tokens": 2009083.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 126.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04477753862738609, + "kl": 0.041311923414468765, + "learning_rate": 1.062e-06, + "loss": 0.0021, + "num_tokens": 2009487.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02466147392988205, + "kl": 0.0027008940232917666, + "learning_rate": 1.0616666666666665e-06, + "loss": 0.0001, + "num_tokens": 2009775.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 126.24074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09314874559640884, + "kl": 0.025913180783391, + "learning_rate": 1.0613333333333335e-06, + "loss": 0.0013, + "num_tokens": 2010089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.25925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06415952742099762, + "kl": 0.010469916742295027, + "learning_rate": 1.061e-06, + "loss": 0.0005, + "num_tokens": 2010371.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 126.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007136716158129275, + "kl": 0.0037858039140701294, + "learning_rate": 1.0606666666666667e-06, + "loss": 0.0002, + "num_tokens": 2010607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1873864233493805, + "kl": 0.030180448666214943, + "learning_rate": 1.0603333333333334e-06, + "loss": 0.0015, + "num_tokens": 2010876.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 126.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10978024452924728, + "kl": 0.020875709131360054, + "learning_rate": 1.06e-06, + "loss": 0.001, + "num_tokens": 2011173.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 126.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12644976377487183, + "kl": 0.03895176900550723, + "learning_rate": 1.0596666666666666e-06, + "loss": 0.0019, + "num_tokens": 2011498.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 126.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5371469855308533, + "kl": 0.062394075095653534, + "learning_rate": 1.0593333333333334e-06, + "loss": 0.0035, + "num_tokens": 2011801.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 126.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6179850101470947, + "kl": 0.08570412918925285, + "learning_rate": 1.059e-06, + "loss": 0.0798, + "num_tokens": 2012142.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 126.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002032228047028184, + "kl": 3.768404167203698e-05, + "learning_rate": 1.0586666666666665e-06, + "loss": 0.0, + "num_tokens": 2012412.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 126.4074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6470732092857361, + "kl": 0.0667794025503099, + "learning_rate": 1.0583333333333335e-06, + "loss": 0.0031, + "num_tokens": 2012647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 126.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0456935316324234, + "kl": 0.025839708745479584, + "learning_rate": 1.058e-06, + "loss": 0.0012, + "num_tokens": 2012973.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 126.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03953197970986366, + "kl": 0.006467314786277711, + "learning_rate": 1.0576666666666666e-06, + "loss": 0.0003, + "num_tokens": 2013282.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0701349601149559, + "kl": 0.033556840382516384, + "learning_rate": 1.0573333333333334e-06, + "loss": 0.0017, + "num_tokens": 2013554.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 126.48148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015966404229402542, + "kl": 0.002121111494489014, + "learning_rate": 1.057e-06, + "loss": 0.0001, + "num_tokens": 2013838.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 126.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028778649866580963, + "kl": 0.0002717167080845684, + "learning_rate": 1.0566666666666666e-06, + "loss": 0.0, + "num_tokens": 2014051.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 126.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03152531385421753, + "kl": 0.005723806796595454, + "learning_rate": 1.0563333333333334e-06, + "loss": 0.0003, + "num_tokens": 2014378.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 126.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06886554509401321, + "kl": 0.003276680188719183, + "learning_rate": 1.056e-06, + "loss": 0.0002, + "num_tokens": 2014621.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 126.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8294137120246887, + "kl": 0.17252267152071, + "learning_rate": 1.0556666666666667e-06, + "loss": 0.0086, + "num_tokens": 2014993.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 126.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007763893809169531, + "kl": 0.2675829231739044, + "learning_rate": 1.0553333333333335e-06, + "loss": 0.0134, + "num_tokens": 2015297.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027351368218660355, + "kl": 0.01299174246378243, + "learning_rate": 1.055e-06, + "loss": 0.0007, + "num_tokens": 2015571.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 126.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024424538016319275, + "kl": 0.005048321094363928, + "learning_rate": 1.0546666666666666e-06, + "loss": 0.0002, + "num_tokens": 2015829.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6837 + }, + { + "clip_ratio/high_max": 0.012820512987673283, + "clip_ratio/high_mean": 0.012820512987673283, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012820512987673283, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 126.62962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.530215263366699, + "kl": 0.07901394739747047, + "learning_rate": 1.0543333333333334e-06, + "loss": 0.0404, + "num_tokens": 2016155.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 126.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02266966551542282, + "kl": 0.003983311471529305, + "learning_rate": 1.054e-06, + "loss": 0.0002, + "num_tokens": 2016415.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 126.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.27441143989563, + "kl": 0.1077931597828865, + "learning_rate": 1.0536666666666668e-06, + "loss": 0.0323, + "num_tokens": 2016789.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10084228962659836, + "kl": 0.00857362465467304, + "learning_rate": 1.0533333333333333e-06, + "loss": 0.0004, + "num_tokens": 2017110.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 126.70370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15140412747859955, + "kl": 0.025718985125422478, + "learning_rate": 1.053e-06, + "loss": 0.0012, + "num_tokens": 2017448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 126.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013510082848370075, + "kl": 0.0009592053538654, + "learning_rate": 1.0526666666666667e-06, + "loss": 0.0, + "num_tokens": 2017744.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 126.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14007802307605743, + "kl": 0.04836008697748184, + "learning_rate": 1.0523333333333335e-06, + "loss": 0.0026, + "num_tokens": 2018113.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 126.75925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.018435478210449, + "kl": 0.05536913452669978, + "learning_rate": 1.052e-06, + "loss": 0.0598, + "num_tokens": 2018442.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 6845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 126.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06786739081144333, + "kl": 0.03475327789783478, + "learning_rate": 1.0516666666666666e-06, + "loss": 0.0017, + "num_tokens": 2018781.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 126.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08906006813049316, + "kl": 0.007779575273161754, + "learning_rate": 1.0513333333333334e-06, + "loss": 0.0004, + "num_tokens": 2019052.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 126.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014465812593698502, + "kl": 0.004050555871799588, + "learning_rate": 1.051e-06, + "loss": 0.0002, + "num_tokens": 2019343.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 126.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006947892718017101, + "kl": 0.0010696232784539461, + "learning_rate": 1.0506666666666668e-06, + "loss": 0.0001, + "num_tokens": 2019563.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 126.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02097518928349018, + "kl": 0.011328531429171562, + "learning_rate": 1.0503333333333333e-06, + "loss": 0.0006, + "num_tokens": 2019824.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 126.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07189391553401947, + "kl": 0.04991878941655159, + "learning_rate": 1.05e-06, + "loss": 0.0025, + "num_tokens": 2020195.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 126.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.209402561187744, + "kl": 0.06212242541369051, + "learning_rate": 1.0496666666666667e-06, + "loss": 0.0056, + "num_tokens": 2020455.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 6852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 126.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04531474411487579, + "kl": 0.0009812377102207392, + "learning_rate": 1.0493333333333335e-06, + "loss": 0.0001, + "num_tokens": 2020712.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 126.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13391625881195068, + "kl": 0.04770008102059364, + "learning_rate": 1.049e-06, + "loss": 0.0024, + "num_tokens": 2021003.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 126.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12648507952690125, + "kl": 0.011427598306909204, + "learning_rate": 1.0486666666666668e-06, + "loss": 0.0006, + "num_tokens": 2021281.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 126.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0040601445361971855, + "kl": 0.0003002174198627472, + "learning_rate": 1.0483333333333334e-06, + "loss": 0.0, + "num_tokens": 2021541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 126.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06571891903877258, + "kl": 0.022734422236680984, + "learning_rate": 1.048e-06, + "loss": 0.0011, + "num_tokens": 2021844.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 127.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024485278874635696, + "kl": 0.0006596893072128296, + "learning_rate": 1.0476666666666667e-06, + "loss": 0.0, + "num_tokens": 2022052.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 127.01851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030234840232878923, + "kl": 9.119510195887415e-06, + "learning_rate": 1.0473333333333333e-06, + "loss": 0.0, + "num_tokens": 2022324.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 127.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00649535097181797, + "kl": 0.0005749553674831986, + "learning_rate": 1.0469999999999999e-06, + "loss": 0.0, + "num_tokens": 2022584.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 127.05555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8829007148742676, + "kl": 0.2540580630302429, + "learning_rate": 1.0466666666666667e-06, + "loss": -0.0197, + "num_tokens": 2022955.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 127.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061333753168582916, + "kl": 0.005895009380765259, + "learning_rate": 1.0463333333333335e-06, + "loss": 0.0003, + "num_tokens": 2023259.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 127.0925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6130591034889221, + "kl": 0.06673680990934372, + "learning_rate": 1.046e-06, + "loss": 0.0036, + "num_tokens": 2023543.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 127.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013818405568599701, + "kl": 0.00047837250167503953, + "learning_rate": 1.0456666666666668e-06, + "loss": 0.0, + "num_tokens": 2023786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 127.12962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016355376690626144, + "kl": 0.0022178636281751096, + "learning_rate": 1.0453333333333334e-06, + "loss": 0.0001, + "num_tokens": 2024070.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 127.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7307298183441162, + "kl": 0.09066342934966087, + "learning_rate": 1.045e-06, + "loss": 0.0059, + "num_tokens": 2024430.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 127.16666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0052855354733765125, + "kl": 6.3976644014474e-05, + "learning_rate": 1.0446666666666667e-06, + "loss": 0.0, + "num_tokens": 2024686.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 127.18518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014830618165433407, + "kl": 0.0012570849794428796, + "learning_rate": 1.0443333333333333e-06, + "loss": 0.0001, + "num_tokens": 2024956.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 127.20370370370371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04651113599538803, + "kl": 0.006027818424627185, + "learning_rate": 1.0439999999999999e-06, + "loss": 0.0003, + "num_tokens": 2025285.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 127.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11370302736759186, + "kl": 0.04116308130323887, + "learning_rate": 1.0436666666666669e-06, + "loss": 0.0021, + "num_tokens": 2025654.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 127.24074074074075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6144707202911377, + "kl": 0.018034445587545633, + "learning_rate": 1.0433333333333334e-06, + "loss": 0.1428, + "num_tokens": 2025942.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 127.25925925925925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9137215614318848, + "kl": 0.02229921519756317, + "learning_rate": 1.043e-06, + "loss": -0.0299, + "num_tokens": 2026232.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 127.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033870477229356766, + "kl": 0.002436707552988082, + "learning_rate": 1.0426666666666668e-06, + "loss": 0.0001, + "num_tokens": 2026504.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 127.29629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02643694542348385, + "kl": 0.0031824863981455564, + "learning_rate": 1.0423333333333334e-06, + "loss": 0.0002, + "num_tokens": 2026794.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 127.31481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018859701231122017, + "kl": 0.00046088120143394917, + "learning_rate": 1.042e-06, + "loss": 0.0, + "num_tokens": 2027072.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 127.33333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03923534229397774, + "kl": 0.0037227485445328057, + "learning_rate": 1.0416666666666667e-06, + "loss": 0.0002, + "num_tokens": 2027402.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 127.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02452106401324272, + "kl": 0.04445505887269974, + "learning_rate": 1.0413333333333333e-06, + "loss": 0.0022, + "num_tokens": 2027806.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6877 + }, + { + "clip_ratio/high_max": 0.012820512987673283, + "clip_ratio/high_mean": 0.012820512987673283, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012820512987673283, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 127.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3960442543029785, + "kl": 0.13830716768279672, + "learning_rate": 1.0409999999999999e-06, + "loss": 0.0445, + "num_tokens": 2028134.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 127.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2857022285461426, + "kl": 0.11537757702171803, + "learning_rate": 1.0406666666666669e-06, + "loss": -0.0416, + "num_tokens": 2028443.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 127.4074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4905614852905273, + "kl": 0.07780079916119576, + "learning_rate": 1.0403333333333334e-06, + "loss": 0.158, + "num_tokens": 2028795.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 127.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09077906608581543, + "kl": 0.014146331697702408, + "learning_rate": 1.04e-06, + "loss": 0.0007, + "num_tokens": 2029096.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 127.44444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01411760225892067, + "kl": 0.0014931396581232548, + "learning_rate": 1.0396666666666668e-06, + "loss": 0.0001, + "num_tokens": 2029392.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 127.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01290932483971119, + "kl": 0.0009142353956121951, + "learning_rate": 1.0393333333333333e-06, + "loss": 0.0, + "num_tokens": 2029611.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 127.48148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9439899921417236, + "kl": 0.05025894194841385, + "learning_rate": 1.039e-06, + "loss": 0.1725, + "num_tokens": 2029986.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 6884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 127.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09740295261144638, + "kl": 0.018897773697972298, + "learning_rate": 1.0386666666666667e-06, + "loss": 0.0009, + "num_tokens": 2030306.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 127.51851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022704126313328743, + "kl": 0.0007961168885231018, + "learning_rate": 1.0383333333333333e-06, + "loss": 0.0, + "num_tokens": 2030566.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 127.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004377822857350111, + "kl": 0.001944471150636673, + "learning_rate": 1.0379999999999998e-06, + "loss": 0.0001, + "num_tokens": 2030878.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 127.55555555555556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025142759084701538, + "kl": 0.0013110190629959106, + "learning_rate": 1.0376666666666668e-06, + "loss": 0.0001, + "num_tokens": 2031090.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 127.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031239144504070282, + "kl": 0.01201264327391982, + "learning_rate": 1.0373333333333334e-06, + "loss": 0.0006, + "num_tokens": 2031351.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 127.5925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44546443223953247, + "kl": 0.047297073528170586, + "learning_rate": 1.037e-06, + "loss": 0.0024, + "num_tokens": 2031615.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 127.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662622347474098, + "kl": 0.0003695487976074219, + "learning_rate": 1.0366666666666668e-06, + "loss": 0.0, + "num_tokens": 2031819.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 127.62962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03863541781902313, + "kl": 0.025793186388909817, + "learning_rate": 1.0363333333333333e-06, + "loss": 0.0014, + "num_tokens": 2032108.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 127.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05816757306456566, + "kl": 0.015358704142272472, + "learning_rate": 1.036e-06, + "loss": 0.0008, + "num_tokens": 2032438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 127.66666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.662898063659668, + "kl": 0.30820655077695847, + "learning_rate": 1.0356666666666667e-06, + "loss": 0.0362, + "num_tokens": 2032787.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 127.68518518518519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022715888917446136, + "kl": 0.005837082164362073, + "learning_rate": 1.0353333333333333e-06, + "loss": 0.0003, + "num_tokens": 2033045.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 127.70370370370371, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9034695625305176, + "kl": 0.17113797832280397, + "learning_rate": 1.035e-06, + "loss": 0.0946, + "num_tokens": 2033353.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 6896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 127.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04517418146133423, + "kl": 0.264210045337677, + "learning_rate": 1.0346666666666668e-06, + "loss": 0.0132, + "num_tokens": 2033658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 127.74074074074075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04011456295847893, + "kl": 0.0033448264002799988, + "learning_rate": 1.0343333333333334e-06, + "loss": 0.0002, + "num_tokens": 2033874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 127.75925925925925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24899545311927795, + "kl": 0.027900994289666414, + "learning_rate": 1.034e-06, + "loss": 0.0014, + "num_tokens": 2034150.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 127.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014461315236985683, + "kl": 0.09716923534870148, + "learning_rate": 1.0336666666666668e-06, + "loss": 0.0049, + "num_tokens": 2034522.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 127.79629629629629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020893394947052002, + "kl": 0.0010304323222953826, + "learning_rate": 1.0333333333333333e-06, + "loss": 0.0001, + "num_tokens": 2034848.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 127.81481481481481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04372726380825043, + "kl": 0.007625943282619119, + "learning_rate": 1.033e-06, + "loss": 0.0004, + "num_tokens": 2035130.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 127.83333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06292850524187088, + "kl": 0.03100801259279251, + "learning_rate": 1.0326666666666667e-06, + "loss": 0.0016, + "num_tokens": 2035447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 127.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010208632797002792, + "kl": 0.0005754321900894865, + "learning_rate": 1.0323333333333332e-06, + "loss": 0.0, + "num_tokens": 2035758.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 127.87037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006103401072323322, + "kl": 9.500980377197266e-05, + "learning_rate": 1.032e-06, + "loss": 0.0, + "num_tokens": 2035970.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 127.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26111310720443726, + "kl": 0.04130120389163494, + "learning_rate": 1.0316666666666668e-06, + "loss": 0.002, + "num_tokens": 2036280.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 127.9074074074074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19271153211593628, + "kl": 0.011852469586301595, + "learning_rate": 1.0313333333333334e-06, + "loss": 0.0006, + "num_tokens": 2036514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 127.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.519508027238771e-05, + "kl": 2.1904706954956055e-06, + "learning_rate": 1.031e-06, + "loss": 0.0, + "num_tokens": 2036734.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 127.94444444444444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033342402428388596, + "kl": 0.0024372367188334465, + "learning_rate": 1.0306666666666667e-06, + "loss": 0.0001, + "num_tokens": 2036994.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 127.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015936752781271935, + "kl": 0.1608145907521248, + "learning_rate": 1.0303333333333333e-06, + "loss": 0.008, + "num_tokens": 2037303.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 127.98148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007167708827182651, + "kl": 0.0037852823734283447, + "learning_rate": 1.03e-06, + "loss": 0.0002, + "num_tokens": 2037539.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 128.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045564018189907074, + "kl": 0.013573323376476765, + "learning_rate": 1.0296666666666667e-06, + "loss": 0.0008, + "num_tokens": 2037813.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 128.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029641034081578255, + "kl": 0.002852322009857744, + "learning_rate": 1.0293333333333332e-06, + "loss": 0.0001, + "num_tokens": 2038140.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010478866286575794, + "kl": 0.0006081965693738312, + "learning_rate": 1.029e-06, + "loss": 0.0, + "num_tokens": 2038463.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 128.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.3764009475708, + "kl": 0.005206409317906946, + "learning_rate": 1.0286666666666668e-06, + "loss": -0.1718, + "num_tokens": 2038702.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 6915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 128.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019619235768914223, + "kl": 0.0012550760293379426, + "learning_rate": 1.0283333333333334e-06, + "loss": 0.0001, + "num_tokens": 2038974.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 128.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03320958837866783, + "kl": 0.0025151907466351986, + "learning_rate": 1.0280000000000002e-06, + "loss": 0.0001, + "num_tokens": 2039234.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 128.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07638654112815857, + "kl": 0.0627257376909256, + "learning_rate": 1.0276666666666667e-06, + "loss": 0.0031, + "num_tokens": 2039609.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03185605630278587, + "kl": 0.004255507723428309, + "learning_rate": 1.0273333333333333e-06, + "loss": 0.0002, + "num_tokens": 2039897.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 128.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.953604497539345e-06, + "kl": 2.332031726837158e-06, + "learning_rate": 1.027e-06, + "loss": 0.0, + "num_tokens": 2040117.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 128.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08747190237045288, + "kl": 0.007621487835422158, + "learning_rate": 1.0266666666666666e-06, + "loss": 0.0004, + "num_tokens": 2040446.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07447715848684311, + "kl": 0.010064984206110239, + "learning_rate": 1.0263333333333332e-06, + "loss": 0.0005, + "num_tokens": 2040738.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6922 + }, + { + "clip_ratio/high_max": 0.008196720853447914, + "clip_ratio/high_mean": 0.008196720853447914, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008196720853447914, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 128.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.604742050170898, + "kl": 0.05840044282376766, + "learning_rate": 1.026e-06, + "loss": 0.2223, + "num_tokens": 2041063.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 6923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 128.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036337144672870636, + "kl": 0.015849258517846465, + "learning_rate": 1.0256666666666668e-06, + "loss": 0.0009, + "num_tokens": 2041357.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 128.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008893858641386032, + "kl": 0.2673604488372803, + "learning_rate": 1.0253333333333334e-06, + "loss": 0.0134, + "num_tokens": 2041661.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039111483842134476, + "kl": 0.008277757093310356, + "learning_rate": 1.0250000000000001e-06, + "loss": 0.0004, + "num_tokens": 2041941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 128.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014161958359181881, + "kl": 0.00192840991076082, + "learning_rate": 1.0246666666666667e-06, + "loss": 0.0001, + "num_tokens": 2042223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 128.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03009587526321411, + "kl": 0.005230151815339923, + "learning_rate": 1.0243333333333333e-06, + "loss": 0.0003, + "num_tokens": 2042491.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 128.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032548196613788605, + "kl": 0.010243687313050032, + "learning_rate": 1.024e-06, + "loss": 0.0005, + "num_tokens": 2042818.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 128.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12394523620605469, + "kl": 0.005353704560548067, + "learning_rate": 1.0236666666666666e-06, + "loss": 0.0003, + "num_tokens": 2043040.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 128.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011096575297415257, + "kl": 0.0028420157614164054, + "learning_rate": 1.0233333333333332e-06, + "loss": 0.0001, + "num_tokens": 2043306.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05578472465276718, + "kl": 0.028132045175880194, + "learning_rate": 1.0230000000000002e-06, + "loss": 0.0014, + "num_tokens": 2043582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 128.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05788668617606163, + "kl": 0.052555881440639496, + "learning_rate": 1.0226666666666668e-06, + "loss": 0.0026, + "num_tokens": 2043922.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 128.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04197635501623154, + "kl": 0.002831184887327254, + "learning_rate": 1.0223333333333333e-06, + "loss": 0.0001, + "num_tokens": 2044220.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6934 + }, + { + "clip_ratio/high_max": 0.009615384973585606, + "clip_ratio/high_mean": 0.009615384973585606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009615384973585606, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 128.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.555518388748169, + "kl": 0.6037652678787708, + "learning_rate": 1.0220000000000001e-06, + "loss": -0.0275, + "num_tokens": 2044554.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 6935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 128.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01702209748327732, + "kl": 0.0002147436262021074, + "learning_rate": 1.0216666666666667e-06, + "loss": 0.0, + "num_tokens": 2044810.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 128.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007536497432738543, + "kl": 0.00377655029296875, + "learning_rate": 1.0213333333333333e-06, + "loss": 0.0002, + "num_tokens": 2045046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005898133385926485, + "kl": 0.00126456783618778, + "learning_rate": 1.021e-06, + "loss": 0.0001, + "num_tokens": 2045326.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05638109892606735, + "kl": 0.008408102672547102, + "learning_rate": 1.0206666666666666e-06, + "loss": 0.0004, + "num_tokens": 2045625.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 128.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4065365195274353, + "kl": 0.10554420202970505, + "learning_rate": 1.0203333333333332e-06, + "loss": 0.0058, + "num_tokens": 2046011.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 128.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.84788990020752, + "kl": 0.02879244275391102, + "learning_rate": 1.0200000000000002e-06, + "loss": 0.1904, + "num_tokens": 2046232.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 6941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 128.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033319215290248394, + "kl": 0.0002970360219478607, + "learning_rate": 1.0196666666666668e-06, + "loss": 0.0, + "num_tokens": 2046492.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 128.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013312243856489658, + "kl": 0.09712640941143036, + "learning_rate": 1.0193333333333333e-06, + "loss": 0.0049, + "num_tokens": 2046864.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 128.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21970827877521515, + "kl": 0.013636435483931564, + "learning_rate": 1.0190000000000001e-06, + "loss": 0.0007, + "num_tokens": 2047176.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 128.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3400464057922363, + "kl": 0.12038914859294891, + "learning_rate": 1.0186666666666667e-06, + "loss": -0.0152, + "num_tokens": 2047505.0, + "reward": 2.0, + "reward_std": 2.4494898319244385, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 2.4494898319244385, + "step": 6945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01786050945520401, + "kl": 0.0004708681663032621, + "learning_rate": 1.0183333333333333e-06, + "loss": 0.0, + "num_tokens": 2047785.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 128.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03709577023983002, + "kl": 0.02674313634634018, + "learning_rate": 1.018e-06, + "loss": 0.0013, + "num_tokens": 2048192.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010438960045576096, + "kl": 0.008320785127580166, + "learning_rate": 1.0176666666666666e-06, + "loss": 0.0004, + "num_tokens": 2048464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 128.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.782740592956543, + "kl": 0.052046431228518486, + "learning_rate": 1.0173333333333332e-06, + "loss": 0.3258, + "num_tokens": 2048790.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 128.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012623009271919727, + "kl": 0.1605926901102066, + "learning_rate": 1.0170000000000002e-06, + "loss": 0.008, + "num_tokens": 2049099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 128.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029894687235355377, + "kl": 0.006722460733726621, + "learning_rate": 1.0166666666666667e-06, + "loss": 0.0003, + "num_tokens": 2049430.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08028239011764526, + "kl": 0.02459784783422947, + "learning_rate": 1.0163333333333333e-06, + "loss": 0.0013, + "num_tokens": 2049718.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 128.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01449799258261919, + "kl": 0.0005191815289435908, + "learning_rate": 1.016e-06, + "loss": 0.0, + "num_tokens": 2049980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 128.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02134798839688301, + "kl": 0.0008262942137662321, + "learning_rate": 1.0156666666666667e-06, + "loss": 0.0, + "num_tokens": 2050196.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 128.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007016902789473534, + "kl": 7.8251462582557e-05, + "learning_rate": 1.0153333333333332e-06, + "loss": 0.0, + "num_tokens": 2050462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 128.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2811586558818817, + "kl": 0.10004860907793045, + "learning_rate": 1.015e-06, + "loss": 0.005, + "num_tokens": 2050831.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 128.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06033000349998474, + "kl": 0.0011277824669377878, + "learning_rate": 1.0146666666666666e-06, + "loss": 0.0001, + "num_tokens": 2051044.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 128.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029922988265752792, + "kl": 0.0030030515044927597, + "learning_rate": 1.0143333333333334e-06, + "loss": 0.0002, + "num_tokens": 2051358.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 128.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00816932413727045, + "kl": 0.0001157522201538086, + "learning_rate": 1.0140000000000002e-06, + "loss": 0.0, + "num_tokens": 2051566.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 128.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042841535061597824, + "kl": 0.0063300770707428455, + "learning_rate": 1.0136666666666667e-06, + "loss": 0.0003, + "num_tokens": 2051834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13512448966503143, + "kl": 0.007924702949821949, + "learning_rate": 1.0133333333333333e-06, + "loss": 0.0004, + "num_tokens": 2052134.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 128.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09692475944757462, + "kl": 0.03999147564172745, + "learning_rate": 1.013e-06, + "loss": 0.002, + "num_tokens": 2052471.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 128.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01623937301337719, + "kl": 0.013393386267125607, + "learning_rate": 1.0126666666666667e-06, + "loss": 0.0007, + "num_tokens": 2052731.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 128.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.519444465637207, + "kl": 0.0775340348482132, + "learning_rate": 1.0123333333333334e-06, + "loss": 0.0781, + "num_tokens": 2053025.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 128.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0452214814722538, + "kl": 0.0024351441534236073, + "learning_rate": 1.012e-06, + "loss": 0.0001, + "num_tokens": 2053259.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 129.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1895883083343506, + "kl": 0.12014993000775576, + "learning_rate": 1.0116666666666666e-06, + "loss": 0.0594, + "num_tokens": 2053549.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 129.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006553079001605511, + "kl": 0.0038548040320165455, + "learning_rate": 1.0113333333333334e-06, + "loss": 0.0002, + "num_tokens": 2053807.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20902186632156372, + "kl": 0.02006150223314762, + "learning_rate": 1.0110000000000001e-06, + "loss": 0.001, + "num_tokens": 2054073.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 129.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03659868985414505, + "kl": 0.00426034489646554, + "learning_rate": 1.0106666666666667e-06, + "loss": 0.0002, + "num_tokens": 2054405.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 129.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.235028028488159, + "kl": 0.04834963008761406, + "learning_rate": 1.0103333333333333e-06, + "loss": -0.0299, + "num_tokens": 2054696.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 6970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 129.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01068859826773405, + "kl": 0.001036324305459857, + "learning_rate": 1.01e-06, + "loss": 0.0001, + "num_tokens": 2054964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 129.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03159612789750099, + "kl": 0.0015633500879630446, + "learning_rate": 1.0096666666666666e-06, + "loss": 0.0001, + "num_tokens": 2055236.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 129.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5784212350845337, + "kl": 0.114490807056427, + "learning_rate": 1.0093333333333334e-06, + "loss": 0.0171, + "num_tokens": 2055577.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 6973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 129.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03063986822962761, + "kl": 0.00534681836143136, + "learning_rate": 1.009e-06, + "loss": 0.0003, + "num_tokens": 2055835.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 129.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06779507547616959, + "kl": 0.0009523332118988037, + "learning_rate": 1.0086666666666666e-06, + "loss": 0.0, + "num_tokens": 2056047.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 129.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011507448740303516, + "kl": 0.00032563507556915283, + "learning_rate": 1.0083333333333333e-06, + "loss": 0.0, + "num_tokens": 2056257.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08644238114356995, + "kl": 0.034969511441886425, + "learning_rate": 1.0080000000000001e-06, + "loss": 0.0016, + "num_tokens": 2056584.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 129.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05382801964879036, + "kl": 0.009691030019894242, + "learning_rate": 1.0076666666666667e-06, + "loss": 0.0005, + "num_tokens": 2056914.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 129.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11369314044713974, + "kl": 0.006775143556296825, + "learning_rate": 1.0073333333333335e-06, + "loss": 0.0003, + "num_tokens": 2057148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.688161849975586, + "kl": 0.34048211574554443, + "learning_rate": 1.007e-06, + "loss": 0.0578, + "num_tokens": 2057462.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029017580673098564, + "kl": 0.0009145679650828242, + "learning_rate": 1.0066666666666666e-06, + "loss": 0.0, + "num_tokens": 2057746.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 129.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8432042598724365, + "kl": 0.050350496312603354, + "learning_rate": 1.0063333333333334e-06, + "loss": -0.0009, + "num_tokens": 2058034.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 6982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 129.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01573021523654461, + "kl": 0.09661111608147621, + "learning_rate": 1.006e-06, + "loss": 0.0048, + "num_tokens": 2058406.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 6983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 129.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00953968707472086, + "kl": 0.0014483824488706887, + "learning_rate": 1.0056666666666666e-06, + "loss": 0.0001, + "num_tokens": 2058688.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 129.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03229231387376785, + "kl": 0.00474539038259536, + "learning_rate": 1.0053333333333333e-06, + "loss": 0.0002, + "num_tokens": 2058978.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 129.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.6413832276593894e-05, + "kl": 2.1979212760925293e-06, + "learning_rate": 1.0050000000000001e-06, + "loss": 0.0, + "num_tokens": 2059198.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 6986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13440021872520447, + "kl": 0.03015802800655365, + "learning_rate": 1.0046666666666667e-06, + "loss": 0.0015, + "num_tokens": 2059492.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 129.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033988712821155787, + "kl": 0.00016928050172282383, + "learning_rate": 1.0043333333333335e-06, + "loss": 0.0, + "num_tokens": 2059804.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 129.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015135381370782852, + "kl": 0.013556838035583496, + "learning_rate": 1.004e-06, + "loss": 0.0007, + "num_tokens": 2060064.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 129.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12105981260538101, + "kl": 0.005264115141471848, + "learning_rate": 1.0036666666666666e-06, + "loss": 0.0003, + "num_tokens": 2060286.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 129.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4594961404800415, + "kl": 0.03914309912943281, + "learning_rate": 1.0033333333333334e-06, + "loss": 0.0021, + "num_tokens": 2060612.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 6991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11597444117069244, + "kl": 0.032896749675273895, + "learning_rate": 1.003e-06, + "loss": 0.0016, + "num_tokens": 2060912.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 129.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08201200515031815, + "kl": 0.018918459303677082, + "learning_rate": 1.0026666666666665e-06, + "loss": 0.0009, + "num_tokens": 2061230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 129.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.148848056793213, + "kl": 0.40280213207006454, + "learning_rate": 1.0023333333333335e-06, + "loss": -0.0136, + "num_tokens": 2061595.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 6994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 129.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.486007422208786, + "kl": 0.07580779306590557, + "learning_rate": 1.002e-06, + "loss": 0.0035, + "num_tokens": 2061915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08523035049438477, + "kl": 0.0449199303984642, + "learning_rate": 1.0016666666666667e-06, + "loss": 0.0023, + "num_tokens": 2062186.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 129.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006495171692222357, + "kl": 0.0015818774700164795, + "learning_rate": 1.0013333333333335e-06, + "loss": 0.0001, + "num_tokens": 2062402.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 129.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04938627406954765, + "kl": 0.0037298735696822405, + "learning_rate": 1.001e-06, + "loss": 0.0002, + "num_tokens": 2062645.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 6998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 129.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022117979824543, + "kl": 0.0022824269253760576, + "learning_rate": 1.0006666666666666e-06, + "loss": 0.0001, + "num_tokens": 2062927.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 6999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 129.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06404024362564087, + "kl": 0.004563490976579487, + "learning_rate": 1.0003333333333334e-06, + "loss": 0.0002, + "num_tokens": 2063181.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 129.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021524703130126, + "kl": 0.005385205149650574, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 2063449.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011293207295238972, + "kl": 0.0011240161256864667, + "learning_rate": 9.996666666666665e-07, + "loss": 0.0001, + "num_tokens": 2063751.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 129.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00586632639169693, + "kl": 0.00010887086318689398, + "learning_rate": 9.993333333333335e-07, + "loss": 0.0, + "num_tokens": 2064007.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 129.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9347095489501953, + "kl": 0.49228236079216003, + "learning_rate": 9.99e-07, + "loss": 0.0046, + "num_tokens": 2064361.0, + "reward": 5.5, + "reward_std": 2.6140644550323486, + "rewards/reward_combined/mean": 5.5, + "rewards/reward_combined/std": 2.6140644550323486, + "step": 7004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 129.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008292172569781542, + "kl": 0.003763720393180847, + "learning_rate": 9.986666666666667e-07, + "loss": 0.0002, + "num_tokens": 2064597.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 129.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07730316370725632, + "kl": 0.015745405107736588, + "learning_rate": 9.983333333333334e-07, + "loss": 0.0008, + "num_tokens": 2064902.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 129.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.363114833831787, + "kl": 0.027033142123400467, + "learning_rate": 9.98e-07, + "loss": -0.0831, + "num_tokens": 2065174.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 129.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007219251710921526, + "kl": 0.0020350394770503044, + "learning_rate": 9.976666666666666e-07, + "loss": 0.0001, + "num_tokens": 2065486.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 129.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12257125228643417, + "kl": 0.009929151739925146, + "learning_rate": 9.973333333333334e-07, + "loss": 0.0005, + "num_tokens": 2065827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 129.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6105772256851196, + "kl": 0.09189913421869278, + "learning_rate": 9.97e-07, + "loss": -0.065, + "num_tokens": 2066192.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 129.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009716881439089775, + "kl": 0.2671699821949005, + "learning_rate": 9.966666666666665e-07, + "loss": 0.0134, + "num_tokens": 2066496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 129.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020831193774938583, + "kl": 0.0019416631548665464, + "learning_rate": 9.963333333333335e-07, + "loss": 0.0001, + "num_tokens": 2066768.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10478778928518295, + "kl": 0.015824542846530676, + "learning_rate": 9.96e-07, + "loss": 0.0008, + "num_tokens": 2067101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 129.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1142385005950928, + "kl": 0.16468075662851334, + "learning_rate": 9.956666666666666e-07, + "loss": -0.0264, + "num_tokens": 2067501.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 7014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 129.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02332492545247078, + "kl": 0.0010798722505569458, + "learning_rate": 9.953333333333334e-07, + "loss": 0.0001, + "num_tokens": 2067713.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 129.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012855621054768562, + "kl": 0.0004808790981769562, + "learning_rate": 9.95e-07, + "loss": 0.0, + "num_tokens": 2067973.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07349342852830887, + "kl": 0.017411372624337673, + "learning_rate": 9.946666666666666e-07, + "loss": 0.001, + "num_tokens": 2068255.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 129.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10274455696344376, + "kl": 0.024848349392414093, + "learning_rate": 9.943333333333334e-07, + "loss": 0.0012, + "num_tokens": 2068557.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 129.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3386154174804688, + "kl": 0.09217208810150623, + "learning_rate": 9.94e-07, + "loss": 0.1317, + "num_tokens": 2068907.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 130.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5648937225341797, + "kl": 0.07233530622033868, + "learning_rate": 9.936666666666667e-07, + "loss": 0.1104, + "num_tokens": 2069209.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 130.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.461804986000061, + "kl": 0.12070186994969845, + "learning_rate": 9.933333333333335e-07, + "loss": 0.0869, + "num_tokens": 2069554.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 7021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 130.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027200795710086823, + "kl": 0.006897842977195978, + "learning_rate": 9.93e-07, + "loss": 0.0003, + "num_tokens": 2069886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 130.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.096599102020264, + "kl": 0.05096434731967747, + "learning_rate": 9.926666666666666e-07, + "loss": 0.0858, + "num_tokens": 2070186.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 130.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009517742320895195, + "kl": 0.26722897589206696, + "learning_rate": 9.923333333333334e-07, + "loss": 0.0134, + "num_tokens": 2070490.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 130.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02489904686808586, + "kl": 0.0003945454955101013, + "learning_rate": 9.92e-07, + "loss": 0.0, + "num_tokens": 2070700.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 130.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02939853072166443, + "kl": 0.006774634122848511, + "learning_rate": 9.916666666666666e-07, + "loss": 0.0003, + "num_tokens": 2070972.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 130.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4942113757133484, + "kl": 0.2910715565085411, + "learning_rate": 9.913333333333333e-07, + "loss": -0.0058, + "num_tokens": 2071340.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 7027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 59.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 59.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 130.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.380537986755371, + "kl": 0.10103682242333889, + "learning_rate": 9.91e-07, + "loss": 0.2665, + "num_tokens": 2071795.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 130.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018369261175394058, + "kl": 0.0023810090497136116, + "learning_rate": 9.906666666666667e-07, + "loss": 0.0001, + "num_tokens": 2072107.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 130.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02092871069908142, + "kl": 0.0007555286138085648, + "learning_rate": 9.903333333333335e-07, + "loss": 0.0, + "num_tokens": 2072323.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 130.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03298458084464073, + "kl": 0.006786221172660589, + "learning_rate": 9.9e-07, + "loss": 0.0003, + "num_tokens": 2072646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 130.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07828333228826523, + "kl": 0.007808617083355784, + "learning_rate": 9.896666666666666e-07, + "loss": 0.0004, + "num_tokens": 2072908.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 130.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10994374006986618, + "kl": 0.016406590584665537, + "learning_rate": 9.893333333333334e-07, + "loss": 0.0008, + "num_tokens": 2073233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 130.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.831333160400391, + "kl": 0.04937233589589596, + "learning_rate": 9.89e-07, + "loss": 0.0972, + "num_tokens": 2073559.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 130.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09697316586971283, + "kl": 0.024668416008353233, + "learning_rate": 9.886666666666668e-07, + "loss": 0.0012, + "num_tokens": 2073886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 130.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07783915847539902, + "kl": 0.003793957643210888, + "learning_rate": 9.883333333333333e-07, + "loss": 0.0002, + "num_tokens": 2074120.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 130.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.231584548950195, + "kl": 0.025572039652615786, + "learning_rate": 9.88e-07, + "loss": 0.2435, + "num_tokens": 2074445.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 130.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.1727960109710693, + "kl": 0.5397544535808265, + "learning_rate": 9.876666666666667e-07, + "loss": 0.0284, + "num_tokens": 2074706.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 130.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005854532122612, + "kl": 0.04777650721371174, + "learning_rate": 9.873333333333335e-07, + "loss": 0.0024, + "num_tokens": 2075110.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014469306915998459, + "kl": 0.0014208531356416643, + "learning_rate": 9.87e-07, + "loss": 0.0001, + "num_tokens": 2075410.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 130.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03294166550040245, + "kl": 0.0030110597144812346, + "learning_rate": 9.866666666666668e-07, + "loss": 0.0002, + "num_tokens": 2075702.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 130.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4369101524353027, + "kl": 0.12501885369420052, + "learning_rate": 9.863333333333334e-07, + "loss": -0.1031, + "num_tokens": 2076069.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 7042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 130.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027926573529839516, + "kl": 0.0006210476385604125, + "learning_rate": 9.86e-07, + "loss": 0.0, + "num_tokens": 2076325.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 130.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05098537355661392, + "kl": 0.005229807575233281, + "learning_rate": 9.856666666666667e-07, + "loss": 0.0003, + "num_tokens": 2076615.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 130.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024386560544371605, + "kl": 0.0015731000748928636, + "learning_rate": 9.853333333333333e-07, + "loss": 0.0001, + "num_tokens": 2076888.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 130.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021922960877418518, + "kl": 0.006068588700145483, + "learning_rate": 9.849999999999999e-07, + "loss": 0.0003, + "num_tokens": 2077156.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 130.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11238568276166916, + "kl": 0.03599953092634678, + "learning_rate": 9.846666666666667e-07, + "loss": 0.0018, + "num_tokens": 2077454.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 130.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06677719205617905, + "kl": 0.005988605320453644, + "learning_rate": 9.843333333333335e-07, + "loss": 0.0003, + "num_tokens": 2077726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 130.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024617446586489677, + "kl": 0.002243560622446239, + "learning_rate": 9.84e-07, + "loss": 0.0001, + "num_tokens": 2078052.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 130.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010872730985283852, + "kl": 0.00036402890691533685, + "learning_rate": 9.836666666666668e-07, + "loss": 0.0, + "num_tokens": 2078366.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 130.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11603320389986038, + "kl": 0.009916636859998107, + "learning_rate": 9.833333333333334e-07, + "loss": 0.0005, + "num_tokens": 2078629.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03702886402606964, + "kl": 0.011082913508289494, + "learning_rate": 9.83e-07, + "loss": 0.0006, + "num_tokens": 2078916.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 130.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.439342829864472e-05, + "kl": 3.2335519790649414e-06, + "learning_rate": 9.826666666666667e-07, + "loss": 0.0, + "num_tokens": 2079136.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 130.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000660731631796807, + "kl": 0.0012536580907180905, + "learning_rate": 9.823333333333333e-07, + "loss": 0.0001, + "num_tokens": 2079416.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 130.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1541651487350464, + "kl": 0.005077527370303869, + "learning_rate": 9.819999999999999e-07, + "loss": 0.0002, + "num_tokens": 2079629.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046823784708976746, + "kl": 0.009026986081153154, + "learning_rate": 9.816666666666669e-07, + "loss": 0.0005, + "num_tokens": 2079915.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 130.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01296068076044321, + "kl": 0.16126051545143127, + "learning_rate": 9.813333333333334e-07, + "loss": 0.0081, + "num_tokens": 2080224.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 130.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984318375587463, + "kl": 0.009020114550366998, + "learning_rate": 9.81e-07, + "loss": 0.0005, + "num_tokens": 2080484.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009963327087461948, + "kl": 0.008633102290332317, + "learning_rate": 9.806666666666668e-07, + "loss": 0.0004, + "num_tokens": 2080756.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17390818893909454, + "kl": 0.019016915932297707, + "learning_rate": 9.803333333333334e-07, + "loss": 0.001, + "num_tokens": 2081032.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 130.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050935182720422745, + "kl": 0.030692865140736103, + "learning_rate": 9.8e-07, + "loss": 0.0015, + "num_tokens": 2081389.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 130.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.701384544372559, + "kl": 0.06063641281798482, + "learning_rate": 9.796666666666667e-07, + "loss": -0.1025, + "num_tokens": 2081666.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 130.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056545305997133255, + "kl": 0.03181551210582256, + "learning_rate": 9.793333333333333e-07, + "loss": 0.0016, + "num_tokens": 2082012.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 130.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008005702402442694, + "kl": 0.0037714391946792603, + "learning_rate": 9.789999999999999e-07, + "loss": 0.0002, + "num_tokens": 2082248.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 130.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014514670707285404, + "kl": 0.0010657445527613163, + "learning_rate": 9.786666666666669e-07, + "loss": 0.0001, + "num_tokens": 2082526.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 130.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005094868130981922, + "kl": 0.00036312639713287354, + "learning_rate": 9.783333333333334e-07, + "loss": 0.0, + "num_tokens": 2082770.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 130.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005842653568834066, + "kl": 0.0036643360799644142, + "learning_rate": 9.78e-07, + "loss": 0.0002, + "num_tokens": 2083028.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 130.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020485663786530495, + "kl": 0.0008285753428936005, + "learning_rate": 9.776666666666668e-07, + "loss": 0.0, + "num_tokens": 2083288.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 130.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02006685361266136, + "kl": 0.0021281553199514747, + "learning_rate": 9.773333333333333e-07, + "loss": 0.0001, + "num_tokens": 2083570.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 130.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.66013765335083, + "kl": 0.04869246482849121, + "learning_rate": 9.77e-07, + "loss": -0.232, + "num_tokens": 2083902.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 7070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 130.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01702677085995674, + "kl": 0.0008085608133114874, + "learning_rate": 9.766666666666667e-07, + "loss": 0.0, + "num_tokens": 2084121.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 130.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12956717610359192, + "kl": 0.010188494343310595, + "learning_rate": 9.763333333333333e-07, + "loss": 0.0006, + "num_tokens": 2084344.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 130.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020360322669148445, + "kl": 0.0009623808145988733, + "learning_rate": 9.759999999999998e-07, + "loss": 0.0, + "num_tokens": 2084673.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07670935243368149, + "kl": 0.01936477469280362, + "learning_rate": 9.756666666666668e-07, + "loss": 0.001, + "num_tokens": 2084994.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 131.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058146726340055466, + "kl": 0.01534320879727602, + "learning_rate": 9.753333333333334e-07, + "loss": 0.0008, + "num_tokens": 2085296.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 131.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024496566504240036, + "kl": 0.001151248812675476, + "learning_rate": 9.75e-07, + "loss": 0.0001, + "num_tokens": 2085508.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 131.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008731354959309101, + "kl": 0.0004917159676551819, + "learning_rate": 9.746666666666668e-07, + "loss": 0.0, + "num_tokens": 2085768.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 131.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03717421367764473, + "kl": 0.0006443336606025696, + "learning_rate": 9.743333333333333e-07, + "loss": 0.0, + "num_tokens": 2085974.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 131.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2457912415266037, + "kl": 0.12425431609153748, + "learning_rate": 9.74e-07, + "loss": 0.0062, + "num_tokens": 2086346.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 131.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03047761879861355, + "kl": 0.004860547371208668, + "learning_rate": 9.736666666666667e-07, + "loss": 0.0002, + "num_tokens": 2086630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 131.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04770100116729736, + "kl": 0.006718369899317622, + "learning_rate": 9.733333333333333e-07, + "loss": 0.0003, + "num_tokens": 2086930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 131.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021395940333604813, + "kl": 0.0018522377649787813, + "learning_rate": 9.73e-07, + "loss": 0.0001, + "num_tokens": 2087218.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05242915824055672, + "kl": 0.013319097459316254, + "learning_rate": 9.726666666666668e-07, + "loss": 0.0007, + "num_tokens": 2087488.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 131.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10964227467775345, + "kl": 0.008151817324687727, + "learning_rate": 9.723333333333334e-07, + "loss": 0.0004, + "num_tokens": 2087707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 131.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014443033374845982, + "kl": 0.0035891212755814195, + "learning_rate": 9.72e-07, + "loss": 0.0002, + "num_tokens": 2087975.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 131.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043561384081840515, + "kl": 0.008046003174968064, + "learning_rate": 9.716666666666668e-07, + "loss": 0.0004, + "num_tokens": 2088302.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 131.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05766814574599266, + "kl": 0.05627436190843582, + "learning_rate": 9.713333333333333e-07, + "loss": 0.0028, + "num_tokens": 2088639.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 131.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.47513588774018e-05, + "kl": 2.600252628326416e-06, + "learning_rate": 9.709999999999999e-07, + "loss": 0.0, + "num_tokens": 2088859.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 131.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.464715480804443, + "kl": 0.06628647446632385, + "learning_rate": 9.706666666666667e-07, + "loss": 0.0071, + "num_tokens": 2089135.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 131.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07437316328287125, + "kl": 0.021190166473388672, + "learning_rate": 9.703333333333332e-07, + "loss": 0.0012, + "num_tokens": 2089415.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 131.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004481513053178787, + "kl": 0.0003196708858013153, + "learning_rate": 9.7e-07, + "loss": 0.0, + "num_tokens": 2089659.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 131.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02414514496922493, + "kl": 0.001568454084917903, + "learning_rate": 9.696666666666668e-07, + "loss": 0.0001, + "num_tokens": 2089966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058970700949430466, + "kl": 0.009768346790224314, + "learning_rate": 9.693333333333334e-07, + "loss": 0.0006, + "num_tokens": 2090349.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 131.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07728622853755951, + "kl": 0.0364050418138504, + "learning_rate": 9.69e-07, + "loss": 0.0019, + "num_tokens": 2090720.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 131.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08415987342596054, + "kl": 0.006773005472496152, + "learning_rate": 9.686666666666667e-07, + "loss": 0.0003, + "num_tokens": 2090988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 131.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8247379064559937, + "kl": 0.09288447350263596, + "learning_rate": 9.683333333333333e-07, + "loss": -0.0913, + "num_tokens": 2091337.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 7096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 131.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017693351954221725, + "kl": 0.0019776462577283382, + "learning_rate": 9.68e-07, + "loss": 0.0001, + "num_tokens": 2091619.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 131.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.263444185256958, + "kl": 0.05585545673966408, + "learning_rate": 9.676666666666667e-07, + "loss": 0.0028, + "num_tokens": 2091949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 131.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.276368111371994, + "kl": 0.022382110357284546, + "learning_rate": 9.673333333333332e-07, + "loss": 0.0011, + "num_tokens": 2092165.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.272090911865234, + "kl": 0.016909361351281404, + "learning_rate": 9.67e-07, + "loss": 0.1955, + "num_tokens": 2092437.0, + "reward": 7.0, + "reward_std": 1.0, + "rewards/reward_combined/mean": 7.0, + "rewards/reward_combined/std": 1.0, + "step": 7100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 131.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.991854190826416, + "kl": 0.018948630429804325, + "learning_rate": 9.666666666666668e-07, + "loss": 0.2911, + "num_tokens": 2092722.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 7101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 131.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29906225204467773, + "kl": 0.04519226215779781, + "learning_rate": 9.663333333333334e-07, + "loss": 0.0024, + "num_tokens": 2092998.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038236718624830246, + "kl": 0.0036044390872120857, + "learning_rate": 9.660000000000002e-07, + "loss": 0.0002, + "num_tokens": 2093258.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 131.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05997953936457634, + "kl": 0.047408703714609146, + "learning_rate": 9.656666666666667e-07, + "loss": 0.0024, + "num_tokens": 2093662.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 131.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05586628243327141, + "kl": 0.016930708661675453, + "learning_rate": 9.653333333333333e-07, + "loss": 0.0008, + "num_tokens": 2093978.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030245967209339142, + "kl": 0.15784524381160736, + "learning_rate": 9.65e-07, + "loss": 0.0079, + "num_tokens": 2094289.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 131.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18933509290218353, + "kl": 0.018535910174250603, + "learning_rate": 9.646666666666666e-07, + "loss": 0.001, + "num_tokens": 2094563.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 131.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1340102106332779, + "kl": 0.016697907354682684, + "learning_rate": 9.643333333333332e-07, + "loss": 0.0008, + "num_tokens": 2094859.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 131.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04514975845813751, + "kl": 0.004168018000200391, + "learning_rate": 9.64e-07, + "loss": 0.0002, + "num_tokens": 2095190.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 131.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.527789831161499, + "kl": 0.9675229638814926, + "learning_rate": 9.636666666666668e-07, + "loss": 0.0478, + "num_tokens": 2095497.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 131.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07461269199848175, + "kl": 0.002892457414418459, + "learning_rate": 9.633333333333334e-07, + "loss": 0.0001, + "num_tokens": 2095730.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027584195137023926, + "kl": 0.00124273075198289, + "learning_rate": 9.630000000000001e-07, + "loss": 0.0001, + "num_tokens": 2096041.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 131.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0327063724398613, + "kl": 0.008052623365074396, + "learning_rate": 9.626666666666667e-07, + "loss": 0.0004, + "num_tokens": 2096329.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 131.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005709430668503046, + "kl": 0.0009871545480564237, + "learning_rate": 9.623333333333333e-07, + "loss": 0.0, + "num_tokens": 2096625.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 131.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013671758584678173, + "kl": 0.0005690186808351427, + "learning_rate": 9.62e-07, + "loss": 0.0, + "num_tokens": 2096948.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027838649228215218, + "kl": 0.0007231563213281333, + "learning_rate": 9.616666666666666e-07, + "loss": 0.0, + "num_tokens": 2097204.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 131.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10903138667345047, + "kl": 0.020756863988935947, + "learning_rate": 9.613333333333332e-07, + "loss": 0.001, + "num_tokens": 2097543.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 131.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.282342910766602, + "kl": 0.2681761170970276, + "learning_rate": 9.610000000000002e-07, + "loss": 0.0486, + "num_tokens": 2097819.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 131.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8388426303863525, + "kl": 0.8125451585510746, + "learning_rate": 9.606666666666668e-07, + "loss": 0.0439, + "num_tokens": 2098107.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 131.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08289333432912827, + "kl": 0.03515140898525715, + "learning_rate": 9.603333333333333e-07, + "loss": 0.0018, + "num_tokens": 2098462.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005263836123049259, + "kl": 0.00024227624089689925, + "learning_rate": 9.600000000000001e-07, + "loss": 0.0, + "num_tokens": 2098724.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05346520617604256, + "kl": 0.012155882082879543, + "learning_rate": 9.596666666666667e-07, + "loss": 0.0006, + "num_tokens": 2099045.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 131.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05276263877749443, + "kl": 0.02926110289990902, + "learning_rate": 9.593333333333333e-07, + "loss": 0.0015, + "num_tokens": 2099345.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 131.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007885195664130151, + "kl": 0.0037667304277420044, + "learning_rate": 9.59e-07, + "loss": 0.0002, + "num_tokens": 2099581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 131.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11647625267505646, + "kl": 0.002949245972558856, + "learning_rate": 9.586666666666666e-07, + "loss": 0.0001, + "num_tokens": 2099853.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 131.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033407438546419144, + "kl": 0.00026413053274154663, + "learning_rate": 9.583333333333332e-07, + "loss": 0.0, + "num_tokens": 2100065.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 131.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6440527439117432, + "kl": 0.12046198267489672, + "learning_rate": 9.580000000000002e-07, + "loss": 0.006, + "num_tokens": 2100377.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 132.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056929267942905426, + "kl": 0.05902410298585892, + "learning_rate": 9.576666666666668e-07, + "loss": 0.0029, + "num_tokens": 2100758.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 132.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22061622142791748, + "kl": 0.030477201100438833, + "learning_rate": 9.573333333333333e-07, + "loss": 0.0018, + "num_tokens": 2101030.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 132.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010728205554187298, + "kl": 0.007938293274492025, + "learning_rate": 9.570000000000001e-07, + "loss": 0.0004, + "num_tokens": 2101302.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 132.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01718173548579216, + "kl": 0.0008112609793897718, + "learning_rate": 9.566666666666667e-07, + "loss": 0.0, + "num_tokens": 2101592.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 132.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4237317740917206, + "kl": 0.06007047765888274, + "learning_rate": 9.563333333333333e-07, + "loss": 0.0032, + "num_tokens": 2101898.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 132.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018750177696347237, + "kl": 0.000255244696745649, + "learning_rate": 9.56e-07, + "loss": 0.0, + "num_tokens": 2102155.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 132.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13300053775310516, + "kl": 0.01826266571879387, + "learning_rate": 9.556666666666666e-07, + "loss": 0.0009, + "num_tokens": 2102417.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006708446424454451, + "kl": 0.0015731006860733032, + "learning_rate": 9.553333333333332e-07, + "loss": 0.0001, + "num_tokens": 2102633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 132.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03312021493911743, + "kl": 0.0016663968563079834, + "learning_rate": 9.550000000000002e-07, + "loss": 0.0001, + "num_tokens": 2102876.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 132.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.852743625640869, + "kl": 0.059035494923591614, + "learning_rate": 9.546666666666667e-07, + "loss": 0.0064, + "num_tokens": 2103250.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 132.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004768291022628546, + "kl": 0.0008493363857269287, + "learning_rate": 9.543333333333333e-07, + "loss": 0.0, + "num_tokens": 2103466.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 132.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0716002807021141, + "kl": 0.012565402314066887, + "learning_rate": 9.54e-07, + "loss": 0.0006, + "num_tokens": 2103794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 132.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025367960333824158, + "kl": 0.05874118395149708, + "learning_rate": 9.536666666666667e-07, + "loss": 0.0029, + "num_tokens": 2104126.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 132.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06814675033092499, + "kl": 0.01120592188090086, + "learning_rate": 9.533333333333333e-07, + "loss": 0.0006, + "num_tokens": 2104475.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 132.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05073690041899681, + "kl": 0.00727212755009532, + "learning_rate": 9.53e-07, + "loss": 0.0003, + "num_tokens": 2104775.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1490279734134674, + "kl": 0.004201886593364179, + "learning_rate": 9.526666666666666e-07, + "loss": 0.0002, + "num_tokens": 2104989.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 132.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026240535080432892, + "kl": 0.15914025157690048, + "learning_rate": 9.523333333333333e-07, + "loss": 0.008, + "num_tokens": 2105299.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 132.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0895161628723145, + "kl": 0.22882658801972866, + "learning_rate": 9.520000000000001e-07, + "loss": 0.0113, + "num_tokens": 2105626.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 132.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016096575185656548, + "kl": 0.003814885189058259, + "learning_rate": 9.516666666666667e-07, + "loss": 0.0002, + "num_tokens": 2105886.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 132.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006233544554561377, + "kl": 0.0003956861619371921, + "learning_rate": 9.513333333333334e-07, + "loss": 0.0, + "num_tokens": 2106207.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 132.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046236991882324, + "kl": 0.268627118319273, + "learning_rate": 9.510000000000001e-07, + "loss": 0.0064, + "num_tokens": 2106578.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 7148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 132.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016420314786955714, + "kl": 0.0003929436206817627, + "learning_rate": 9.506666666666667e-07, + "loss": 0.0, + "num_tokens": 2106838.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09429097920656204, + "kl": 0.004155139613430947, + "learning_rate": 9.503333333333333e-07, + "loss": 0.0002, + "num_tokens": 2107056.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 132.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9344552755355835, + "kl": 0.03557317424565554, + "learning_rate": 9.5e-07, + "loss": 0.0397, + "num_tokens": 2107375.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 132.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041324201971292496, + "kl": 0.2614790052175522, + "learning_rate": 9.496666666666666e-07, + "loss": 0.0131, + "num_tokens": 2107679.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0645325630903244, + "kl": 0.000960037112236023, + "learning_rate": 9.493333333333333e-07, + "loss": 0.0, + "num_tokens": 2107891.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 132.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03375959396362305, + "kl": 0.0027521795127540827, + "learning_rate": 9.49e-07, + "loss": 0.0001, + "num_tokens": 2108218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 132.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6301610469818115, + "kl": 0.022922604344785213, + "learning_rate": 9.486666666666667e-07, + "loss": 0.2837, + "num_tokens": 2108578.0, + "reward": 6.25, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 2.5, + "step": 7155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 132.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1570698767900467, + "kl": 0.03986375965178013, + "learning_rate": 9.483333333333334e-07, + "loss": 0.002, + "num_tokens": 2108853.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.0, + "completions/max_terminated_length": 63.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 132.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3675270080566406, + "kl": 0.07046718522906303, + "learning_rate": 9.480000000000001e-07, + "loss": 0.0045, + "num_tokens": 2109250.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 132.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015076756477355957, + "kl": 0.0035436644102446735, + "learning_rate": 9.476666666666666e-07, + "loss": 0.0002, + "num_tokens": 2109518.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048188216984272, + "kl": 0.004340869025327265, + "learning_rate": 9.473333333333333e-07, + "loss": 0.0002, + "num_tokens": 2109795.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 132.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018355129286646843, + "kl": 0.0006780156109016389, + "learning_rate": 9.47e-07, + "loss": 0.0, + "num_tokens": 2110031.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 132.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0468345507979393, + "kl": 0.007834693882614374, + "learning_rate": 9.466666666666667e-07, + "loss": 0.0004, + "num_tokens": 2110315.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 132.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8543777465820312, + "kl": 0.04336899612098932, + "learning_rate": 9.463333333333335e-07, + "loss": -0.0715, + "num_tokens": 2110607.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 132.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016212743939831853, + "kl": 0.0002623516629682854, + "learning_rate": 9.460000000000001e-07, + "loss": 0.0, + "num_tokens": 2110921.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03348354622721672, + "kl": 0.0027104535838589072, + "learning_rate": 9.456666666666667e-07, + "loss": 0.0001, + "num_tokens": 2111219.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008736312738619745, + "kl": 0.003747418522834778, + "learning_rate": 9.453333333333334e-07, + "loss": 0.0002, + "num_tokens": 2111455.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.131101608276367, + "kl": 0.05928567633964121, + "learning_rate": 9.450000000000001e-07, + "loss": 0.0233, + "num_tokens": 2111728.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 132.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020665723830461502, + "kl": 0.006254879139305558, + "learning_rate": 9.446666666666666e-07, + "loss": 0.0003, + "num_tokens": 2112000.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 132.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08057216554880142, + "kl": 0.006814703578129411, + "learning_rate": 9.443333333333333e-07, + "loss": 0.0003, + "num_tokens": 2112263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7168 + }, + { + "clip_ratio/high_max": 0.01315789483487606, + "clip_ratio/high_mean": 0.01315789483487606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01315789483487606, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 132.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7153453826904297, + "kl": 0.06348188780248165, + "learning_rate": 9.44e-07, + "loss": 0.0156, + "num_tokens": 2112564.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 132.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00875805038958788, + "kl": 0.0006303263508016244, + "learning_rate": 9.436666666666667e-07, + "loss": 0.0, + "num_tokens": 2112840.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044893015176057816, + "kl": 0.006651686737313867, + "learning_rate": 9.433333333333334e-07, + "loss": 0.0003, + "num_tokens": 2113110.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059397123754024506, + "kl": 0.012119903694838285, + "learning_rate": 9.430000000000001e-07, + "loss": 0.0006, + "num_tokens": 2113390.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 132.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003593391156755388, + "kl": 0.00020550936460494995, + "learning_rate": 9.426666666666667e-07, + "loss": 0.0, + "num_tokens": 2113610.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 132.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021664176136255264, + "kl": 0.0019108533742837608, + "learning_rate": 9.423333333333334e-07, + "loss": 0.0001, + "num_tokens": 2113870.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 132.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13996350765228271, + "kl": 0.02745542861521244, + "learning_rate": 9.42e-07, + "loss": 0.0015, + "num_tokens": 2114210.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7175 + }, + { + "clip_ratio/high_max": 0.012195121496915817, + "clip_ratio/high_mean": 0.012195121496915817, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012195121496915817, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 132.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.1121768951416, + "kl": 1.65606177598238, + "learning_rate": 9.416666666666667e-07, + "loss": 0.0882, + "num_tokens": 2114518.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 7176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 132.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03864877671003342, + "kl": 0.0028798532439395785, + "learning_rate": 9.413333333333333e-07, + "loss": 0.0001, + "num_tokens": 2114798.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 132.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06892888247966766, + "kl": 0.012835131026804447, + "learning_rate": 9.41e-07, + "loss": 0.0006, + "num_tokens": 2115112.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 132.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4013471603393555, + "kl": 0.09095125645399094, + "learning_rate": 9.406666666666666e-07, + "loss": -0.0371, + "num_tokens": 2115474.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 132.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048363130539655685, + "kl": 0.010076596401631832, + "learning_rate": 9.403333333333334e-07, + "loss": 0.0005, + "num_tokens": 2115765.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 132.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6886708736419678, + "kl": 0.07726818695664406, + "learning_rate": 9.400000000000001e-07, + "loss": 0.0109, + "num_tokens": 2116171.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 7181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 133.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08039382845163345, + "kl": 0.018315540626645088, + "learning_rate": 9.396666666666667e-07, + "loss": 0.0009, + "num_tokens": 2116518.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03761099651455879, + "kl": 0.005296449642628431, + "learning_rate": 9.393333333333334e-07, + "loss": 0.0003, + "num_tokens": 2116786.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01886494643986225, + "kl": 0.0023565638111904263, + "learning_rate": 9.39e-07, + "loss": 0.0001, + "num_tokens": 2117068.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 133.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13610948622226715, + "kl": 0.01815592311322689, + "learning_rate": 9.386666666666667e-07, + "loss": 0.0009, + "num_tokens": 2117344.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31861409544944763, + "kl": 0.03766192775219679, + "learning_rate": 9.383333333333333e-07, + "loss": 0.0019, + "num_tokens": 2117615.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 133.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002420982113108039, + "kl": 2.590566873550415e-05, + "learning_rate": 9.38e-07, + "loss": 0.0, + "num_tokens": 2117827.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 133.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06352825462818146, + "kl": 0.001503325649537146, + "learning_rate": 9.376666666666666e-07, + "loss": 0.0001, + "num_tokens": 2118061.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 133.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03194012865424156, + "kl": 0.011991779319941998, + "learning_rate": 9.373333333333334e-07, + "loss": 0.0006, + "num_tokens": 2118333.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 133.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7620279788970947, + "kl": 0.09522133693099022, + "learning_rate": 9.370000000000001e-07, + "loss": -0.0202, + "num_tokens": 2118699.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 7190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 133.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08648505061864853, + "kl": 0.006787155929487199, + "learning_rate": 9.366666666666668e-07, + "loss": 0.0003, + "num_tokens": 2119025.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 133.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01592927612364292, + "kl": 0.00040593318408355117, + "learning_rate": 9.363333333333333e-07, + "loss": 0.0, + "num_tokens": 2119305.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010431910865008831, + "kl": 0.0008702820050530136, + "learning_rate": 9.36e-07, + "loss": 0.0, + "num_tokens": 2119573.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 133.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2150237262248993, + "kl": 0.05814521200954914, + "learning_rate": 9.356666666666667e-07, + "loss": 0.003, + "num_tokens": 2119863.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 133.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.396121501922607, + "kl": 0.033672990277409554, + "learning_rate": 9.353333333333334e-07, + "loss": 0.1315, + "num_tokens": 2120186.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 133.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009516783989965916, + "kl": 0.00034907087683677673, + "learning_rate": 9.349999999999999e-07, + "loss": 0.0, + "num_tokens": 2120430.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03322471305727959, + "kl": 0.0030200803303159773, + "learning_rate": 9.346666666666666e-07, + "loss": 0.0002, + "num_tokens": 2120728.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 133.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036663591861724854, + "kl": 0.00414731225464493, + "learning_rate": 9.343333333333334e-07, + "loss": 0.0002, + "num_tokens": 2121030.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 133.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08941317349672318, + "kl": 0.011213188990950584, + "learning_rate": 9.340000000000001e-07, + "loss": 0.0006, + "num_tokens": 2121308.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 93.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 93.75, + "completions/mean_terminated_length": 39.66666793823242, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 133.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2320513725280762, + "kl": 0.06742130592465401, + "learning_rate": 9.336666666666668e-07, + "loss": 0.0537, + "num_tokens": 2121899.0, + "reward": 4.175000190734863, + "reward_std": 4.4485015869140625, + "rewards/reward_combined/mean": 4.175000190734863, + "rewards/reward_combined/std": 4.4485015869140625, + "step": 7200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 133.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058102354407310486, + "kl": 0.010958315804600716, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0005, + "num_tokens": 2122211.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 133.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034183163195848465, + "kl": 0.1602887436747551, + "learning_rate": 9.33e-07, + "loss": 0.008, + "num_tokens": 2122521.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 133.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08965833485126495, + "kl": 0.012937887106090784, + "learning_rate": 9.326666666666667e-07, + "loss": 0.0007, + "num_tokens": 2122794.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 133.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007982464740052819, + "kl": 0.00376911461353302, + "learning_rate": 9.323333333333334e-07, + "loss": 0.0002, + "num_tokens": 2123030.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 133.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019955292344093323, + "kl": 0.000316062563797459, + "learning_rate": 9.319999999999999e-07, + "loss": 0.0, + "num_tokens": 2123286.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 133.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010446756146848202, + "kl": 0.0006545372307300568, + "learning_rate": 9.316666666666666e-07, + "loss": 0.0, + "num_tokens": 2123546.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 133.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06672053039073944, + "kl": 0.02910972759127617, + "learning_rate": 9.313333333333334e-07, + "loss": 0.0015, + "num_tokens": 2123912.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 133.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22056028246879578, + "kl": 0.06442374363541603, + "learning_rate": 9.310000000000001e-07, + "loss": 0.0034, + "num_tokens": 2124246.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 133.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048039067536592484, + "kl": 0.008211891632527113, + "learning_rate": 9.306666666666667e-07, + "loss": 0.0004, + "num_tokens": 2124570.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 133.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09931541234254837, + "kl": 0.022510704584419727, + "learning_rate": 9.303333333333334e-07, + "loss": 0.0012, + "num_tokens": 2124873.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 133.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06466849148273468, + "kl": 0.008924984140321612, + "learning_rate": 9.3e-07, + "loss": 0.0004, + "num_tokens": 2125193.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 133.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004334203898906708, + "kl": 0.0012924571637995541, + "learning_rate": 9.296666666666667e-07, + "loss": 0.0001, + "num_tokens": 2125409.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 133.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001046365941874683, + "kl": 0.0013271399657242, + "learning_rate": 9.293333333333333e-07, + "loss": 0.0001, + "num_tokens": 2125689.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 133.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009608503431081772, + "kl": 0.0035395300947129726, + "learning_rate": 9.289999999999999e-07, + "loss": 0.0002, + "num_tokens": 2125980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 133.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517580583691597, + "kl": 0.2592521905899048, + "learning_rate": 9.286666666666666e-07, + "loss": 0.013, + "num_tokens": 2126284.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 133.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04558124765753746, + "kl": 0.04163266532123089, + "learning_rate": 9.283333333333334e-07, + "loss": 0.0021, + "num_tokens": 2126689.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 133.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026923806872218847, + "kl": 0.00033708091359585524, + "learning_rate": 9.28e-07, + "loss": 0.0, + "num_tokens": 2126951.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 133.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019567392766475677, + "kl": 0.04918581433594227, + "learning_rate": 9.276666666666667e-07, + "loss": 0.0025, + "num_tokens": 2127283.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 133.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037634242326021194, + "kl": 0.004057594807818532, + "learning_rate": 9.273333333333334e-07, + "loss": 0.0002, + "num_tokens": 2127581.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 43.0, + "completions/mean_terminated_length": 43.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 133.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.990529179573059, + "kl": 0.016978265717625618, + "learning_rate": 9.27e-07, + "loss": -0.0566, + "num_tokens": 2127973.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 7220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 133.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018729979172348976, + "kl": 0.012191944755613804, + "learning_rate": 9.266666666666667e-07, + "loss": 0.0006, + "num_tokens": 2128233.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 133.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01827273704111576, + "kl": 0.0008070696494542062, + "learning_rate": 9.263333333333333e-07, + "loss": 0.0, + "num_tokens": 2128525.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 133.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22310420870780945, + "kl": 0.021159586030989885, + "learning_rate": 9.26e-07, + "loss": 0.0011, + "num_tokens": 2128791.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 133.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012367135845124722, + "kl": 0.000512710539624095, + "learning_rate": 9.256666666666668e-07, + "loss": 0.0, + "num_tokens": 2129112.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 133.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0059858545428142e-05, + "kl": 2.086162567138672e-06, + "learning_rate": 9.253333333333335e-07, + "loss": 0.0, + "num_tokens": 2129332.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 133.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06855064630508423, + "kl": 0.020937534049153328, + "learning_rate": 9.25e-07, + "loss": 0.001, + "num_tokens": 2129664.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 133.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03979175537824631, + "kl": 0.008876292733475566, + "learning_rate": 9.246666666666667e-07, + "loss": 0.0005, + "num_tokens": 2130003.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 133.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08385109156370163, + "kl": 0.035915348678827286, + "learning_rate": 9.243333333333334e-07, + "loss": 0.0018, + "num_tokens": 2130305.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 133.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11219722032546997, + "kl": 0.01823048759251833, + "learning_rate": 9.24e-07, + "loss": 0.0009, + "num_tokens": 2130601.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 133.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003798265242949128, + "kl": 0.0012842849391745403, + "learning_rate": 9.236666666666666e-07, + "loss": 0.0001, + "num_tokens": 2130820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 133.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.193485260009766, + "kl": 0.005961761809885502, + "learning_rate": 9.233333333333333e-07, + "loss": 0.2998, + "num_tokens": 2131047.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 133.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17145322263240814, + "kl": 0.01510188402608037, + "learning_rate": 9.23e-07, + "loss": 0.0008, + "num_tokens": 2131309.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 133.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0979493856430054, + "kl": 0.21211452782154083, + "learning_rate": 9.226666666666668e-07, + "loss": -0.0084, + "num_tokens": 2131679.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 7233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 133.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017782876268029213, + "kl": 0.0005536973476409912, + "learning_rate": 9.223333333333335e-07, + "loss": 0.0, + "num_tokens": 2131889.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 133.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012977372854948044, + "kl": 0.0006392856448655948, + "learning_rate": 9.22e-07, + "loss": 0.0, + "num_tokens": 2132198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 134.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03445803374052048, + "kl": 0.0052281885873526335, + "learning_rate": 9.216666666666667e-07, + "loss": 0.0003, + "num_tokens": 2132456.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016842296347022057, + "kl": 0.002088193374220282, + "learning_rate": 9.213333333333334e-07, + "loss": 0.0001, + "num_tokens": 2132726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 134.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2457070350646973, + "kl": 0.04580681957304478, + "learning_rate": 9.210000000000001e-07, + "loss": 0.0706, + "num_tokens": 2133017.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 134.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02117704600095749, + "kl": 0.005317485425621271, + "learning_rate": 9.206666666666666e-07, + "loss": 0.0003, + "num_tokens": 2133347.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02129325270652771, + "kl": 0.00043713750346796587, + "learning_rate": 9.203333333333333e-07, + "loss": 0.0, + "num_tokens": 2133615.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02226569689810276, + "kl": 0.00281720410566777, + "learning_rate": 9.2e-07, + "loss": 0.0001, + "num_tokens": 2133897.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 134.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23504067957401276, + "kl": 0.06554269045591354, + "learning_rate": 9.196666666666668e-07, + "loss": 0.0035, + "num_tokens": 2134264.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.200326919555664, + "kl": 0.16619390342384577, + "learning_rate": 9.193333333333334e-07, + "loss": -0.2013, + "num_tokens": 2134627.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 134.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010768643580377102, + "kl": 0.0005014777125325054, + "learning_rate": 9.19e-07, + "loss": 0.0, + "num_tokens": 2134947.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03616509214043617, + "kl": 0.0031698253005743027, + "learning_rate": 9.186666666666667e-07, + "loss": 0.0001, + "num_tokens": 2135224.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 134.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039958786219358444, + "kl": 0.005971988663077354, + "learning_rate": 9.183333333333334e-07, + "loss": 0.0003, + "num_tokens": 2135524.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 134.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004152415785938501, + "kl": 0.00035480037331581116, + "learning_rate": 9.18e-07, + "loss": 0.0, + "num_tokens": 2135768.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 134.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049710825085639954, + "kl": 0.008523158729076385, + "learning_rate": 9.176666666666666e-07, + "loss": 0.0004, + "num_tokens": 2136101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022367453202605247, + "kl": 0.15715914964675903, + "learning_rate": 9.173333333333333e-07, + "loss": 0.0079, + "num_tokens": 2136412.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 134.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05105190724134445, + "kl": 0.25952573120594025, + "learning_rate": 9.17e-07, + "loss": 0.013, + "num_tokens": 2136716.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11837867647409439, + "kl": 0.022399356588721275, + "learning_rate": 9.166666666666667e-07, + "loss": 0.0011, + "num_tokens": 2137006.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 134.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053479500114917755, + "kl": 0.03572300262749195, + "learning_rate": 9.163333333333334e-07, + "loss": 0.0018, + "num_tokens": 2137343.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 134.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022783905267715454, + "kl": 0.0011843194661196321, + "learning_rate": 9.160000000000001e-07, + "loss": 0.0, + "num_tokens": 2137559.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 134.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.229832172393799, + "kl": 0.02698578219860792, + "learning_rate": 9.156666666666667e-07, + "loss": 0.3546, + "num_tokens": 2137895.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 7254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 134.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026810159906744957, + "kl": 0.002930235117673874, + "learning_rate": 9.153333333333334e-07, + "loss": 0.0001, + "num_tokens": 2138222.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 134.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003228947054594755, + "kl": 0.0002876996877603233, + "learning_rate": 9.15e-07, + "loss": 0.0, + "num_tokens": 2138442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 134.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014878548681735992, + "kl": 0.0009900970035232604, + "learning_rate": 9.146666666666666e-07, + "loss": 0.0, + "num_tokens": 2138677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765254020690918, + "kl": 0.1371312439441681, + "learning_rate": 9.143333333333333e-07, + "loss": 0.1271, + "num_tokens": 2139016.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0661940574646, + "kl": 0.0509836096316576, + "learning_rate": 9.14e-07, + "loss": 0.0834, + "num_tokens": 2139317.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 134.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012088281102478504, + "kl": 0.09811355918645859, + "learning_rate": 9.136666666666667e-07, + "loss": 0.0049, + "num_tokens": 2139689.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 134.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.98696231842041, + "kl": 0.07401011791080236, + "learning_rate": 9.133333333333334e-07, + "loss": 0.0665, + "num_tokens": 2139996.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 134.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030476661399006844, + "kl": 0.0017551222117617726, + "learning_rate": 9.130000000000001e-07, + "loss": 0.0001, + "num_tokens": 2140290.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 134.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5068140029907227, + "kl": 0.12856233259662986, + "learning_rate": 9.126666666666667e-07, + "loss": 0.0061, + "num_tokens": 2140612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0058793858624994755, + "kl": 0.0003757834492716938, + "learning_rate": 9.123333333333333e-07, + "loss": 0.0, + "num_tokens": 2140872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 134.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046370524913072586, + "kl": 0.0031247527804225683, + "learning_rate": 9.12e-07, + "loss": 0.0002, + "num_tokens": 2141138.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08554859459400177, + "kl": 0.007047143764793873, + "learning_rate": 9.116666666666667e-07, + "loss": 0.0004, + "num_tokens": 2141398.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 134.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009564424864947796, + "kl": 0.0022046566009521484, + "learning_rate": 9.113333333333333e-07, + "loss": 0.0001, + "num_tokens": 2141614.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04136870428919792, + "kl": 0.013337010983377695, + "learning_rate": 9.109999999999999e-07, + "loss": 0.0007, + "num_tokens": 2141888.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 134.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23459157347679138, + "kl": 0.09211067855358124, + "learning_rate": 9.106666666666667e-07, + "loss": 0.0047, + "num_tokens": 2142263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 134.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.816853940486908, + "kl": 0.21860068291425705, + "learning_rate": 9.103333333333334e-07, + "loss": 0.0102, + "num_tokens": 2142610.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 134.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05367870628833771, + "kl": 0.013943172059953213, + "learning_rate": 9.100000000000001e-07, + "loss": 0.0007, + "num_tokens": 2142924.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 134.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054747845977544785, + "kl": 0.0009156376127066324, + "learning_rate": 9.096666666666668e-07, + "loss": 0.0, + "num_tokens": 2143137.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14112626016139984, + "kl": 0.018187306355684996, + "learning_rate": 9.093333333333333e-07, + "loss": 0.0009, + "num_tokens": 2143411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 134.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07531093060970306, + "kl": 0.02200077986344695, + "learning_rate": 9.09e-07, + "loss": 0.0011, + "num_tokens": 2143681.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014039292931556702, + "kl": 0.00014556049791281112, + "learning_rate": 9.086666666666667e-07, + "loss": 0.0, + "num_tokens": 2143937.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 134.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003431979101151228, + "kl": 0.00029359757900238037, + "learning_rate": 9.083333333333332e-07, + "loss": 0.0, + "num_tokens": 2144197.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 134.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004764489829540253, + "kl": 0.00020127611060161144, + "learning_rate": 9.079999999999999e-07, + "loss": 0.0, + "num_tokens": 2144513.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 134.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036810729652643204, + "kl": 0.040626462548971176, + "learning_rate": 9.076666666666667e-07, + "loss": 0.002, + "num_tokens": 2144918.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 134.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.5539378334069625e-05, + "kl": 2.5853514671325684e-06, + "learning_rate": 9.073333333333334e-07, + "loss": 0.0, + "num_tokens": 2145138.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 134.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22406013309955597, + "kl": 0.018947109580039978, + "learning_rate": 9.070000000000001e-07, + "loss": 0.001, + "num_tokens": 2145436.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05794462189078331, + "kl": 0.006478884955868125, + "learning_rate": 9.066666666666667e-07, + "loss": 0.0003, + "num_tokens": 2145696.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 134.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09144249558448792, + "kl": 0.044348349794745445, + "learning_rate": 9.063333333333333e-07, + "loss": 0.0023, + "num_tokens": 2145987.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 134.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018856236711144447, + "kl": 0.012103037908673286, + "learning_rate": 9.06e-07, + "loss": 0.0006, + "num_tokens": 2146247.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 134.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025597816333174706, + "kl": 0.000511661171913147, + "learning_rate": 9.056666666666667e-07, + "loss": 0.0, + "num_tokens": 2146457.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 134.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13349851965904236, + "kl": 0.0038930486189201474, + "learning_rate": 9.053333333333333e-07, + "loss": 0.0002, + "num_tokens": 2146725.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 134.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017097294330596924, + "kl": 0.004612116841599345, + "learning_rate": 9.050000000000001e-07, + "loss": 0.0002, + "num_tokens": 2147015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 134.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03454576060175896, + "kl": 0.002968351007439196, + "learning_rate": 9.046666666666668e-07, + "loss": 0.0001, + "num_tokens": 2147317.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 134.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008546649478375912, + "kl": 0.0037566646933555603, + "learning_rate": 9.043333333333334e-07, + "loss": 0.0002, + "num_tokens": 2147553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 134.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6142919063568115, + "kl": 0.09330939501523972, + "learning_rate": 9.04e-07, + "loss": 0.103, + "num_tokens": 2147897.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 135.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05476091057062149, + "kl": 0.022868791595101357, + "learning_rate": 9.036666666666667e-07, + "loss": 0.0011, + "num_tokens": 2148196.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 135.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.067389488220215, + "kl": 0.02682997426018119, + "learning_rate": 9.033333333333333e-07, + "loss": 0.2163, + "num_tokens": 2148469.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 135.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8548285961151123, + "kl": 0.11683328449726105, + "learning_rate": 9.03e-07, + "loss": 0.0052, + "num_tokens": 2148841.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 7292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 135.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28243163228034973, + "kl": 0.06176641955971718, + "learning_rate": 9.026666666666666e-07, + "loss": 0.003, + "num_tokens": 2149162.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 135.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1634560376405716, + "kl": 0.016710239462554455, + "learning_rate": 9.023333333333333e-07, + "loss": 0.0009, + "num_tokens": 2149438.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 135.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016343019902706146, + "kl": 0.00015169084508670494, + "learning_rate": 9.020000000000001e-07, + "loss": 0.0, + "num_tokens": 2149694.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 135.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06792984157800674, + "kl": 0.004617521073669195, + "learning_rate": 9.016666666666668e-07, + "loss": 0.0002, + "num_tokens": 2149971.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 135.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10269705206155777, + "kl": 0.01501919748261571, + "learning_rate": 9.013333333333334e-07, + "loss": 0.0008, + "num_tokens": 2150313.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 135.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11704476922750473, + "kl": 0.042291984893381596, + "learning_rate": 9.01e-07, + "loss": 0.0017, + "num_tokens": 2150683.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 135.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1805291175842285, + "kl": 0.10104147903621197, + "learning_rate": 9.006666666666667e-07, + "loss": -0.0512, + "num_tokens": 2150981.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 7299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 135.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023492639884352684, + "kl": 0.001130125325289555, + "learning_rate": 9.003333333333334e-07, + "loss": 0.0001, + "num_tokens": 2151216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 135.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02734074741601944, + "kl": 0.006692580878734589, + "learning_rate": 9e-07, + "loss": 0.0004, + "num_tokens": 2151486.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00541269313544035, + "kl": 0.00010779500007629395, + "learning_rate": 8.996666666666666e-07, + "loss": 0.0, + "num_tokens": 2151698.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 135.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02181081846356392, + "kl": 0.0005224151827860624, + "learning_rate": 8.993333333333333e-07, + "loss": 0.0, + "num_tokens": 2152014.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02816421538591385, + "kl": 0.0009589850669726729, + "learning_rate": 8.990000000000001e-07, + "loss": 0.0, + "num_tokens": 2152233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7304 + }, + { + "clip_ratio/high_max": 0.0059523810632526875, + "clip_ratio/high_mean": 0.0059523810632526875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0059523810632526875, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 135.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6694557666778564, + "kl": 0.16646184399724007, + "learning_rate": 8.986666666666668e-07, + "loss": -0.0705, + "num_tokens": 2152594.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.008952808158938e-05, + "kl": 2.205371856689453e-06, + "learning_rate": 8.983333333333333e-07, + "loss": 0.0, + "num_tokens": 2152814.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 135.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05522434040904045, + "kl": 0.25461290776729584, + "learning_rate": 8.98e-07, + "loss": 0.0127, + "num_tokens": 2153119.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 135.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03750606253743172, + "kl": 0.00523244240321219, + "learning_rate": 8.976666666666667e-07, + "loss": 0.0003, + "num_tokens": 2153387.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 135.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013643357902765274, + "kl": 0.0016249477630481124, + "learning_rate": 8.973333333333334e-07, + "loss": 0.0001, + "num_tokens": 2153669.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 135.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06837660074234009, + "kl": 0.009281745064072311, + "learning_rate": 8.969999999999999e-07, + "loss": 0.0005, + "num_tokens": 2153960.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 135.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06455466151237488, + "kl": 0.014946409035474062, + "learning_rate": 8.966666666666666e-07, + "loss": 0.0007, + "num_tokens": 2154282.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 135.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06256791949272156, + "kl": 0.010306134354323149, + "learning_rate": 8.963333333333333e-07, + "loss": 0.0005, + "num_tokens": 2154596.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 135.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4787709712982178, + "kl": 0.06068704463541508, + "learning_rate": 8.960000000000001e-07, + "loss": 0.1584, + "num_tokens": 2154950.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 7313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 135.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06644254922866821, + "kl": 0.013334487099200487, + "learning_rate": 8.956666666666668e-07, + "loss": 0.0007, + "num_tokens": 2155228.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 135.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7882707118988037, + "kl": 0.4084884971380234, + "learning_rate": 8.953333333333334e-07, + "loss": -0.019, + "num_tokens": 2155607.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 135.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05079388618469238, + "kl": 0.0033153147669509053, + "learning_rate": 8.95e-07, + "loss": 0.0002, + "num_tokens": 2155875.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 135.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05334262549877167, + "kl": 0.009435899555683136, + "learning_rate": 8.946666666666667e-07, + "loss": 0.0004, + "num_tokens": 2156167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 135.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.543372869491577, + "kl": 0.1856471374630928, + "learning_rate": 8.943333333333334e-07, + "loss": 0.0529, + "num_tokens": 2156480.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 135.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043067336082458496, + "kl": 0.016864736913703382, + "learning_rate": 8.939999999999999e-07, + "loss": 0.0009, + "num_tokens": 2156766.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 135.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10759197920560837, + "kl": 0.04988580569624901, + "learning_rate": 8.936666666666666e-07, + "loss": 0.0024, + "num_tokens": 2157129.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 135.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029751647263765335, + "kl": 0.005605928134173155, + "learning_rate": 8.933333333333333e-07, + "loss": 0.0003, + "num_tokens": 2157403.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1020454540848732, + "kl": 0.010115991695784032, + "learning_rate": 8.930000000000001e-07, + "loss": 0.0003, + "num_tokens": 2157657.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 135.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06299417465925217, + "kl": 0.010090203722938895, + "learning_rate": 8.926666666666667e-07, + "loss": 0.0005, + "num_tokens": 2157984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 135.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022969236597418785, + "kl": 0.004164924961514771, + "learning_rate": 8.923333333333334e-07, + "loss": 0.0002, + "num_tokens": 2158290.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 135.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030619587749242783, + "kl": 0.0006797239184379578, + "learning_rate": 8.92e-07, + "loss": 0.0, + "num_tokens": 2158496.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 135.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.761586904525757, + "kl": 0.08104568719863892, + "learning_rate": 8.916666666666667e-07, + "loss": 0.0837, + "num_tokens": 2158796.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 7326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.748947143554688, + "kl": 0.049962278455495834, + "learning_rate": 8.913333333333333e-07, + "loss": -0.1669, + "num_tokens": 2159036.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 7327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 135.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06620163470506668, + "kl": 0.011506594251841307, + "learning_rate": 8.91e-07, + "loss": 0.0006, + "num_tokens": 2159362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 135.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01109394896775484, + "kl": 0.0008406780543737113, + "learning_rate": 8.906666666666666e-07, + "loss": 0.0, + "num_tokens": 2159636.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 135.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04895327240228653, + "kl": 0.006518984911963344, + "learning_rate": 8.903333333333333e-07, + "loss": 0.0003, + "num_tokens": 2159926.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 135.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.219571352005005, + "kl": 0.1056404709815979, + "learning_rate": 8.900000000000001e-07, + "loss": 0.0334, + "num_tokens": 2160265.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 135.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037076301872730255, + "kl": 0.010937594808638096, + "learning_rate": 8.896666666666667e-07, + "loss": 0.0005, + "num_tokens": 2160587.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 135.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026273062452673912, + "kl": 0.03700084798038006, + "learning_rate": 8.893333333333334e-07, + "loss": 0.0019, + "num_tokens": 2161000.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 135.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04364119842648506, + "kl": 0.007080773822963238, + "learning_rate": 8.890000000000001e-07, + "loss": 0.0004, + "num_tokens": 2161268.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 135.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26789894700050354, + "kl": 0.04030096344649792, + "learning_rate": 8.886666666666667e-07, + "loss": 0.0019, + "num_tokens": 2161559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 135.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008967450819909573, + "kl": 0.0005120581045048311, + "learning_rate": 8.883333333333333e-07, + "loss": 0.0, + "num_tokens": 2161878.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008881927351467311, + "kl": 0.0037580057978630066, + "learning_rate": 8.88e-07, + "loss": 0.0002, + "num_tokens": 2162114.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 135.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0976402536034584, + "kl": 0.03780166991055012, + "learning_rate": 8.876666666666666e-07, + "loss": 0.0019, + "num_tokens": 2162416.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 135.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014479395933449268, + "kl": 0.0005550645291805267, + "learning_rate": 8.873333333333333e-07, + "loss": 0.0, + "num_tokens": 2162676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 135.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024733468890190125, + "kl": 0.001297086477279663, + "learning_rate": 8.87e-07, + "loss": 0.0001, + "num_tokens": 2162888.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 135.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019985897466540337, + "kl": 0.011954582296311855, + "learning_rate": 8.866666666666667e-07, + "loss": 0.0006, + "num_tokens": 2163148.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 135.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002589184558019042, + "kl": 0.0002625075576361269, + "learning_rate": 8.863333333333334e-07, + "loss": 0.0, + "num_tokens": 2163410.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 135.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008975645527243614, + "kl": 0.001971140503883362, + "learning_rate": 8.860000000000001e-07, + "loss": 0.0001, + "num_tokens": 2163626.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09261632710695267, + "kl": 0.024340140633285046, + "learning_rate": 8.856666666666666e-07, + "loss": 0.0012, + "num_tokens": 2163900.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 136.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006089119706302881, + "kl": 0.00023233643150888383, + "learning_rate": 8.853333333333333e-07, + "loss": 0.0, + "num_tokens": 2164172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03549829497933388, + "kl": 0.0017097890377044678, + "learning_rate": 8.85e-07, + "loss": 0.0001, + "num_tokens": 2164470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 136.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004759668372571468, + "kl": 5.406886339187622e-05, + "learning_rate": 8.846666666666667e-07, + "loss": 0.0, + "num_tokens": 2164682.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 136.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04371841624379158, + "kl": 0.002771449158899486, + "learning_rate": 8.843333333333335e-07, + "loss": 0.0001, + "num_tokens": 2164936.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 136.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09800943732261658, + "kl": 0.017581870779395103, + "learning_rate": 8.840000000000001e-07, + "loss": 0.0009, + "num_tokens": 2165216.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 136.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9392058253288269, + "kl": 0.1343661308346782, + "learning_rate": 8.836666666666667e-07, + "loss": 0.0073, + "num_tokens": 2165519.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05032603442668915, + "kl": 0.003250275389291346, + "learning_rate": 8.833333333333334e-07, + "loss": 0.0002, + "num_tokens": 2165787.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 136.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.7639427318936214e-05, + "kl": 2.6226043701171875e-06, + "learning_rate": 8.830000000000001e-07, + "loss": 0.0, + "num_tokens": 2166007.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 136.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.272624492645264, + "kl": 0.07689910009503365, + "learning_rate": 8.826666666666666e-07, + "loss": -0.1998, + "num_tokens": 2166319.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004807692486792803, + "clip_ratio/low_min": 0.004807692486792803, + "clip_ratio/region_mean": 0.004807692486792803, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 136.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5374581813812256, + "kl": 0.026032373309135437, + "learning_rate": 8.823333333333333e-07, + "loss": -0.0007, + "num_tokens": 2166684.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 136.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.366523027420044, + "kl": 0.01075270283035934, + "learning_rate": 8.82e-07, + "loss": -0.0768, + "num_tokens": 2167019.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 136.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4346952438354492, + "kl": 0.5086937826126814, + "learning_rate": 8.816666666666667e-07, + "loss": 0.0659, + "num_tokens": 2167309.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 136.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06944682449102402, + "kl": 0.008605902519775555, + "learning_rate": 8.813333333333334e-07, + "loss": 0.0004, + "num_tokens": 2167632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 136.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015439038397744298, + "kl": 0.00028729066252708435, + "learning_rate": 8.810000000000001e-07, + "loss": 0.0, + "num_tokens": 2167892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 136.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05211462453007698, + "kl": 0.011499294079840183, + "learning_rate": 8.806666666666667e-07, + "loss": 0.0006, + "num_tokens": 2168220.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 136.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038138214498758316, + "kl": 0.06741504743695259, + "learning_rate": 8.803333333333334e-07, + "loss": 0.0034, + "num_tokens": 2168552.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 136.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.326338529586792, + "kl": 0.07935124635696411, + "learning_rate": 8.8e-07, + "loss": 0.0111, + "num_tokens": 2168929.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 136.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016121793538331985, + "kl": 0.0006826934622949921, + "learning_rate": 8.796666666666667e-07, + "loss": 0.0, + "num_tokens": 2169191.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 136.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003704465925693512, + "kl": 0.00028383731842041016, + "learning_rate": 8.793333333333333e-07, + "loss": 0.0, + "num_tokens": 2169411.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.209465980529785, + "kl": 0.03110265452414751, + "learning_rate": 8.79e-07, + "loss": 0.3676, + "num_tokens": 2169743.0, + "reward": 7.800000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.800000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 7364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.920591354370117, + "kl": 0.02079156506806612, + "learning_rate": 8.786666666666666e-07, + "loss": 0.0745, + "num_tokens": 2170074.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 136.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 15.74850845336914, + "kl": 0.043286800384521484, + "learning_rate": 8.783333333333334e-07, + "loss": -0.0034, + "num_tokens": 2170280.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 136.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015274131670594215, + "kl": 0.025646859779953957, + "learning_rate": 8.780000000000001e-07, + "loss": 0.0013, + "num_tokens": 2170634.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 136.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006005997769534588, + "kl": 0.00012720064114546403, + "learning_rate": 8.776666666666667e-07, + "loss": 0.0, + "num_tokens": 2170891.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 136.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020352181047201157, + "kl": 0.09921921417117119, + "learning_rate": 8.773333333333334e-07, + "loss": 0.005, + "num_tokens": 2171263.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 136.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04101250320672989, + "kl": 0.002260155975818634, + "learning_rate": 8.77e-07, + "loss": 0.0001, + "num_tokens": 2171531.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 136.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030056556686758995, + "kl": 0.0007235308876261115, + "learning_rate": 8.766666666666667e-07, + "loss": 0.0, + "num_tokens": 2171747.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.645255088806152, + "kl": 0.1835203063674271, + "learning_rate": 8.763333333333333e-07, + "loss": 0.0001, + "num_tokens": 2172019.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009333707392215729, + "kl": 0.0005047744052717462, + "learning_rate": 8.76e-07, + "loss": 0.0, + "num_tokens": 2172337.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0256818775087595, + "kl": 0.0023071628529578447, + "learning_rate": 8.756666666666666e-07, + "loss": 0.0001, + "num_tokens": 2172614.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 136.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025416959542781115, + "kl": 0.00013458231842378154, + "learning_rate": 8.753333333333334e-07, + "loss": 0.0, + "num_tokens": 2172926.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03927861899137497, + "kl": 0.0072903805412352085, + "learning_rate": 8.750000000000001e-07, + "loss": 0.0004, + "num_tokens": 2173214.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00921502336859703, + "kl": 0.0003606189420679584, + "learning_rate": 8.746666666666668e-07, + "loss": 0.0, + "num_tokens": 2173482.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 136.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008314108476042747, + "kl": 0.0016662552952766418, + "learning_rate": 8.743333333333333e-07, + "loss": 0.0001, + "num_tokens": 2173698.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 136.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028388969600200653, + "kl": 0.0032668503699824214, + "learning_rate": 8.74e-07, + "loss": 0.0002, + "num_tokens": 2174018.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 136.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012800322845578194, + "kl": 0.15921640396118164, + "learning_rate": 8.736666666666667e-07, + "loss": 0.0079, + "num_tokens": 2174328.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 136.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047093044966459274, + "kl": 0.001638702000491321, + "learning_rate": 8.733333333333333e-07, + "loss": 0.0001, + "num_tokens": 2174561.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 136.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005203679669648409, + "kl": 0.0008633090765215456, + "learning_rate": 8.729999999999999e-07, + "loss": 0.0, + "num_tokens": 2174845.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03214145451784134, + "kl": 0.006641737651079893, + "learning_rate": 8.726666666666666e-07, + "loss": 0.0003, + "num_tokens": 2175135.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 136.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6138439178466797, + "kl": 0.013574820943176746, + "learning_rate": 8.723333333333334e-07, + "loss": -0.0353, + "num_tokens": 2175432.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 136.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.073247194290161, + "kl": 0.1852499432861805, + "learning_rate": 8.720000000000001e-07, + "loss": -0.0173, + "num_tokens": 2175812.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 7385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 136.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01346014067530632, + "kl": 0.0003483604086795822, + "learning_rate": 8.716666666666668e-07, + "loss": 0.0, + "num_tokens": 2176055.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 136.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000995326554402709, + "kl": 0.003733530640602112, + "learning_rate": 8.713333333333333e-07, + "loss": 0.0002, + "num_tokens": 2176291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 136.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011303563602268696, + "kl": 0.007572616450488567, + "learning_rate": 8.71e-07, + "loss": 0.0004, + "num_tokens": 2176563.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 136.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.698444366455078, + "kl": 0.1807866357266903, + "learning_rate": 8.706666666666667e-07, + "loss": 0.0232, + "num_tokens": 2176868.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 136.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04582621529698372, + "kl": 0.003882407210767269, + "learning_rate": 8.703333333333334e-07, + "loss": 0.0002, + "num_tokens": 2177180.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 136.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020384741947054863, + "kl": 0.004229044541716576, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0002, + "num_tokens": 2177436.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 136.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7010905742645264, + "kl": 0.11481644958257675, + "learning_rate": 8.696666666666666e-07, + "loss": 0.0059, + "num_tokens": 2177814.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 7392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 136.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0655355155467987, + "kl": 0.24870573729276657, + "learning_rate": 8.693333333333334e-07, + "loss": 0.0124, + "num_tokens": 2178122.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 136.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18719778954982758, + "kl": 0.03866432886570692, + "learning_rate": 8.690000000000001e-07, + "loss": 0.0019, + "num_tokens": 2178426.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 136.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0578639917075634, + "kl": 0.04248492605984211, + "learning_rate": 8.686666666666667e-07, + "loss": 0.0021, + "num_tokens": 2178830.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 136.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06568656861782074, + "kl": 0.010178704280406237, + "learning_rate": 8.683333333333334e-07, + "loss": 0.0005, + "num_tokens": 2179126.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 136.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11169084161520004, + "kl": 0.0135839837603271, + "learning_rate": 8.68e-07, + "loss": 0.0007, + "num_tokens": 2179402.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 137.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022460181266069412, + "kl": 0.011400938034057617, + "learning_rate": 8.676666666666667e-07, + "loss": 0.0006, + "num_tokens": 2179662.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7398 + }, + { + "clip_ratio/high_max": 0.006666666828095913, + "clip_ratio/high_mean": 0.006666666828095913, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006666666828095913, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 137.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.146008014678955, + "kl": 0.2829881012439728, + "learning_rate": 8.673333333333333e-07, + "loss": -0.0069, + "num_tokens": 2180036.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 137.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028076032176613808, + "kl": 0.0056443470530211926, + "learning_rate": 8.669999999999999e-07, + "loss": 0.0003, + "num_tokens": 2180304.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 137.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049363575875759125, + "kl": 0.008579898159950972, + "learning_rate": 8.666666666666666e-07, + "loss": 0.0004, + "num_tokens": 2180621.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 137.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05836309492588043, + "kl": 0.002380305028054863, + "learning_rate": 8.663333333333334e-07, + "loss": 0.0001, + "num_tokens": 2180889.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 137.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.664955139160156, + "kl": 0.09833686100319028, + "learning_rate": 8.66e-07, + "loss": 0.1315, + "num_tokens": 2181170.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 7403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 137.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.70896053314209, + "kl": 0.5666388068348169, + "learning_rate": 8.656666666666667e-07, + "loss": -0.0912, + "num_tokens": 2181387.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 7404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 137.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02353428490459919, + "kl": 0.0005417026113718748, + "learning_rate": 8.653333333333334e-07, + "loss": 0.0, + "num_tokens": 2181621.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 137.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022645464166998863, + "kl": 0.001115315710194409, + "learning_rate": 8.65e-07, + "loss": 0.0001, + "num_tokens": 2181909.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 137.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.814314842224121, + "kl": 0.23310255235992372, + "learning_rate": 8.646666666666667e-07, + "loss": 0.0369, + "num_tokens": 2182235.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 137.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032141633331775665, + "kl": 0.0005283355712890625, + "learning_rate": 8.643333333333333e-07, + "loss": 0.0, + "num_tokens": 2182455.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 137.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06343802064657211, + "kl": 0.004680798389017582, + "learning_rate": 8.64e-07, + "loss": 0.0003, + "num_tokens": 2182686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 137.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.0105619430542, + "kl": 0.1006287969648838, + "learning_rate": 8.636666666666668e-07, + "loss": 0.0639, + "num_tokens": 2182963.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7410 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.013888888992369175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013888888992369175, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 137.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.662911891937256, + "kl": 0.06964758038520813, + "learning_rate": 8.633333333333335e-07, + "loss": 0.2889, + "num_tokens": 2183283.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 137.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08836774528026581, + "kl": 0.006171148270368576, + "learning_rate": 8.63e-07, + "loss": 0.0003, + "num_tokens": 2183551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 137.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.435682773590088, + "kl": 0.022472238168120384, + "learning_rate": 8.626666666666667e-07, + "loss": -0.001, + "num_tokens": 2183866.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 137.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042500365525484085, + "kl": 0.0037860737647861242, + "learning_rate": 8.623333333333334e-07, + "loss": 0.0002, + "num_tokens": 2184160.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 137.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.332398533821106, + "kl": 0.13493806798942387, + "learning_rate": 8.62e-07, + "loss": 0.0078, + "num_tokens": 2184460.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 137.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030837273225188255, + "kl": 0.0015538225416094065, + "learning_rate": 8.616666666666666e-07, + "loss": 0.0001, + "num_tokens": 2184787.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 137.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08426261693239212, + "kl": 0.011116457171738148, + "learning_rate": 8.613333333333333e-07, + "loss": 0.0006, + "num_tokens": 2185118.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 137.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9193925857543945, + "kl": 0.24017392098903656, + "learning_rate": 8.61e-07, + "loss": 0.0303, + "num_tokens": 2185423.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 137.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006789442617446184, + "kl": 0.16325239092111588, + "learning_rate": 8.606666666666668e-07, + "loss": 0.0082, + "num_tokens": 2185731.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 137.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006305330898612738, + "kl": 0.0008604274480603635, + "learning_rate": 8.603333333333335e-07, + "loss": 0.0, + "num_tokens": 2186015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 137.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0816707611083984, + "kl": 0.06952119991183281, + "learning_rate": 8.6e-07, + "loss": 0.0824, + "num_tokens": 2186366.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 7421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 137.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1182108223438263, + "kl": 0.039948973804712296, + "learning_rate": 8.596666666666667e-07, + "loss": 0.002, + "num_tokens": 2186712.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 137.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01411399058997631, + "kl": 0.0006649158895015717, + "learning_rate": 8.593333333333334e-07, + "loss": 0.0, + "num_tokens": 2186972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 137.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04029836878180504, + "kl": 0.007934166118502617, + "learning_rate": 8.590000000000001e-07, + "loss": 0.0004, + "num_tokens": 2187319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 137.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01258003432303667, + "kl": 0.00032490704325027764, + "learning_rate": 8.586666666666666e-07, + "loss": 0.0, + "num_tokens": 2187562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 137.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03614702448248863, + "kl": 0.0005190700394450687, + "learning_rate": 8.583333333333333e-07, + "loss": 0.0, + "num_tokens": 2187818.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 137.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8715689182281494, + "kl": 0.1937999464571476, + "learning_rate": 8.58e-07, + "loss": 0.1032, + "num_tokens": 2188100.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 137.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05860782414674759, + "kl": 0.007456639315932989, + "learning_rate": 8.576666666666668e-07, + "loss": 0.0004, + "num_tokens": 2188360.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 137.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20272482931613922, + "kl": 0.026359963230788708, + "learning_rate": 8.573333333333334e-07, + "loss": 0.0014, + "num_tokens": 2188663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 137.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009349181782454252, + "kl": 0.001314584689680487, + "learning_rate": 8.57e-07, + "loss": 0.0001, + "num_tokens": 2188943.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 137.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7454628944396973, + "kl": 0.3071683496236801, + "learning_rate": 8.566666666666667e-07, + "loss": 0.0136, + "num_tokens": 2189239.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 137.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004477109760046005, + "kl": 0.0004341356980148703, + "learning_rate": 8.563333333333334e-07, + "loss": 0.0, + "num_tokens": 2189559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 137.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09784694015979767, + "kl": 0.025170376524329185, + "learning_rate": 8.56e-07, + "loss": 0.0014, + "num_tokens": 2189824.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 137.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004328091163188219, + "kl": 0.00010947883129119873, + "learning_rate": 8.556666666666666e-07, + "loss": 0.0, + "num_tokens": 2190036.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 137.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06888636946678162, + "kl": 0.003335043787956238, + "learning_rate": 8.553333333333333e-07, + "loss": 0.0002, + "num_tokens": 2190250.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 137.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0348845012485981, + "kl": 0.029288400895893574, + "learning_rate": 8.55e-07, + "loss": 0.0015, + "num_tokens": 2190602.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 137.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.342391014099121, + "kl": 0.22621587512549013, + "learning_rate": 8.546666666666667e-07, + "loss": 0.0816, + "num_tokens": 2190865.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 137.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039388205856084824, + "kl": 0.006763803539797664, + "learning_rate": 8.543333333333334e-07, + "loss": 0.0003, + "num_tokens": 2191138.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 137.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02858257293701172, + "kl": 0.00419685392989777, + "learning_rate": 8.540000000000001e-07, + "loss": 0.0002, + "num_tokens": 2191406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 137.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018522974103689194, + "kl": 0.000549808144569397, + "learning_rate": 8.536666666666667e-07, + "loss": 0.0, + "num_tokens": 2191618.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 137.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022440942004323006, + "kl": 0.09833554178476334, + "learning_rate": 8.533333333333334e-07, + "loss": 0.0049, + "num_tokens": 2191991.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 137.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04527389258146286, + "kl": 0.003962432034313679, + "learning_rate": 8.53e-07, + "loss": 0.0002, + "num_tokens": 2192305.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 137.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04913357272744179, + "kl": 0.04473540745675564, + "learning_rate": 8.526666666666666e-07, + "loss": 0.0022, + "num_tokens": 2192709.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 137.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008490979089401662, + "kl": 0.0037662237882614136, + "learning_rate": 8.523333333333333e-07, + "loss": 0.0002, + "num_tokens": 2192945.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 137.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.828980922698975, + "kl": 0.041903593111783266, + "learning_rate": 8.52e-07, + "loss": 0.0698, + "num_tokens": 2193251.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 137.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9139034748077393, + "kl": 0.16533025354146957, + "learning_rate": 8.516666666666667e-07, + "loss": 0.0101, + "num_tokens": 2193617.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 137.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08962789922952652, + "kl": 0.007310988090466708, + "learning_rate": 8.513333333333334e-07, + "loss": 0.0004, + "num_tokens": 2193895.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 137.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04856106638908386, + "kl": 0.013421293813735247, + "learning_rate": 8.510000000000001e-07, + "loss": 0.0007, + "num_tokens": 2194197.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 137.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.592230319976807, + "kl": 1.1883927583694458, + "learning_rate": 8.506666666666667e-07, + "loss": 0.1217, + "num_tokens": 2194498.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 7449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 137.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023714344948530197, + "kl": 0.0006568315147887915, + "learning_rate": 8.503333333333333e-07, + "loss": 0.0, + "num_tokens": 2194762.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 137.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11396137624979019, + "kl": 0.01255968026816845, + "learning_rate": 8.5e-07, + "loss": 0.0007, + "num_tokens": 2195050.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 138.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006025221664458513, + "kl": 0.0002475397413945757, + "learning_rate": 8.496666666666667e-07, + "loss": 0.0, + "num_tokens": 2195359.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 138.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04535014182329178, + "kl": 0.002231706981547177, + "learning_rate": 8.493333333333333e-07, + "loss": 0.0001, + "num_tokens": 2195630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 138.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029302461072802544, + "kl": 0.028821095824241638, + "learning_rate": 8.489999999999999e-07, + "loss": 0.0014, + "num_tokens": 2195998.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 138.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009031427907757461, + "kl": 0.001305686600971967, + "learning_rate": 8.486666666666667e-07, + "loss": 0.0001, + "num_tokens": 2196278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05860074982047081, + "kl": 0.011074992828071117, + "learning_rate": 8.483333333333334e-07, + "loss": 0.0005, + "num_tokens": 2196580.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004128076136112213, + "kl": 0.000924179214052856, + "learning_rate": 8.480000000000001e-07, + "loss": 0.0, + "num_tokens": 2196864.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 138.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.072437286376953, + "kl": 0.4014170467853546, + "learning_rate": 8.476666666666668e-07, + "loss": 0.0488, + "num_tokens": 2197170.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 138.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14057204127311707, + "kl": 0.021488927770406008, + "learning_rate": 8.473333333333333e-07, + "loss": 0.0011, + "num_tokens": 2197468.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04771888256072998, + "kl": 0.007995693013072014, + "learning_rate": 8.47e-07, + "loss": 0.0004, + "num_tokens": 2197752.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 138.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0066382321529090405, + "kl": 0.00027922044682782143, + "learning_rate": 8.466666666666667e-07, + "loss": 0.0, + "num_tokens": 2198024.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 138.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036018560640513897, + "kl": 0.0002032928168773651, + "learning_rate": 8.463333333333332e-07, + "loss": 0.0, + "num_tokens": 2198268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 138.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02132377214729786, + "kl": 0.0009703243558760732, + "learning_rate": 8.459999999999999e-07, + "loss": 0.0001, + "num_tokens": 2198558.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 138.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007851053029298782, + "kl": 0.0007555115735158324, + "learning_rate": 8.456666666666667e-07, + "loss": 0.0, + "num_tokens": 2198836.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 138.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859008312225342, + "kl": 0.1322932867333293, + "learning_rate": 8.453333333333334e-07, + "loss": -0.0719, + "num_tokens": 2199154.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06922364234924316, + "kl": 0.0015659108757972717, + "learning_rate": 8.450000000000001e-07, + "loss": 0.0001, + "num_tokens": 2199374.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 138.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.385145902633667, + "kl": 0.07519279047846794, + "learning_rate": 8.446666666666667e-07, + "loss": 0.1105, + "num_tokens": 2199713.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 7467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 138.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14342519640922546, + "kl": 0.019923364743590355, + "learning_rate": 8.443333333333333e-07, + "loss": 0.001, + "num_tokens": 2199982.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 138.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01938001625239849, + "kl": 0.0010086982365464792, + "learning_rate": 8.44e-07, + "loss": 0.0001, + "num_tokens": 2200280.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 138.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7041492462158203, + "kl": 0.18741435185074806, + "learning_rate": 8.436666666666667e-07, + "loss": 0.0616, + "num_tokens": 2200603.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 138.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003147014183923602, + "kl": 0.0002091715796268545, + "learning_rate": 8.433333333333333e-07, + "loss": 0.0, + "num_tokens": 2200865.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2989468574523926, + "kl": 0.046004125848412514, + "learning_rate": 8.430000000000001e-07, + "loss": 0.0613, + "num_tokens": 2201149.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 138.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 19.241243362426758, + "kl": 0.03809886300587095, + "learning_rate": 8.426666666666668e-07, + "loss": 0.3625, + "num_tokens": 2201373.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.816703796386719, + "kl": 0.031998677644878626, + "learning_rate": 8.423333333333334e-07, + "loss": 0.0026, + "num_tokens": 2201649.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 138.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004953030962496996, + "kl": 0.0004164865240454674, + "learning_rate": 8.42e-07, + "loss": 0.0, + "num_tokens": 2201971.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.44444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.697404861450195, + "kl": 0.006802628748118877, + "learning_rate": 8.416666666666667e-07, + "loss": 0.4424, + "num_tokens": 2202244.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 138.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037498679012060165, + "kl": 0.006209843559190631, + "learning_rate": 8.413333333333333e-07, + "loss": 0.0003, + "num_tokens": 2202582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007692615035921335, + "kl": 0.003784686326980591, + "learning_rate": 8.41e-07, + "loss": 0.0002, + "num_tokens": 2202818.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 138.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08540654927492142, + "kl": 0.011341342236846685, + "learning_rate": 8.406666666666667e-07, + "loss": 0.0006, + "num_tokens": 2203107.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 138.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1784486770629883, + "kl": 0.07143725268542767, + "learning_rate": 8.403333333333333e-07, + "loss": -0.03, + "num_tokens": 2203484.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 138.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.930631637573242, + "kl": 0.027482120785862207, + "learning_rate": 8.400000000000001e-07, + "loss": 0.0834, + "num_tokens": 2203812.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 138.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02112310752272606, + "kl": 0.005166828632354736, + "learning_rate": 8.396666666666668e-07, + "loss": 0.0003, + "num_tokens": 2204084.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 138.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044708251953125, + "kl": 0.008042186964303255, + "learning_rate": 8.393333333333334e-07, + "loss": 0.0004, + "num_tokens": 2204402.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 138.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011828447692096233, + "kl": 0.0004938519632560201, + "learning_rate": 8.39e-07, + "loss": 0.0, + "num_tokens": 2204713.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 138.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0983519554138184, + "kl": 0.30614741519093513, + "learning_rate": 8.386666666666667e-07, + "loss": -0.0641, + "num_tokens": 2205029.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9500738382339478, + "kl": 0.30372533947229385, + "learning_rate": 8.383333333333334e-07, + "loss": 0.0152, + "num_tokens": 2205283.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 138.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025342263281345367, + "kl": 0.0042840738606173545, + "learning_rate": 8.38e-07, + "loss": 0.0002, + "num_tokens": 2205551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 138.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0386507622897625, + "kl": 0.013293697265908122, + "learning_rate": 8.376666666666666e-07, + "loss": 0.0007, + "num_tokens": 2205825.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 138.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2004091888666153, + "kl": 0.01904579158872366, + "learning_rate": 8.373333333333333e-07, + "loss": 0.0011, + "num_tokens": 2206095.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 138.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3351521492004395, + "kl": 0.08389908447861671, + "learning_rate": 8.370000000000001e-07, + "loss": -0.0351, + "num_tokens": 2206410.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 7490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08179929107427597, + "kl": 0.004590657539665699, + "learning_rate": 8.366666666666668e-07, + "loss": 0.0002, + "num_tokens": 2206623.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 138.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06452228128910065, + "kl": 0.019506637006998062, + "learning_rate": 8.363333333333333e-07, + "loss": 0.001, + "num_tokens": 2206923.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 138.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0369425043463707, + "kl": 0.04019300080835819, + "learning_rate": 8.36e-07, + "loss": 0.002, + "num_tokens": 2207328.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 138.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011346984654664993, + "kl": 0.1592487022280693, + "learning_rate": 8.356666666666667e-07, + "loss": 0.008, + "num_tokens": 2207638.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 138.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054627083241939545, + "kl": 0.009328438900411129, + "learning_rate": 8.353333333333334e-07, + "loss": 0.0004, + "num_tokens": 2207931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 138.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005104490090161562, + "kl": 0.00032470822043251246, + "learning_rate": 8.349999999999999e-07, + "loss": 0.0, + "num_tokens": 2208151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 138.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023571014404296875, + "kl": 0.011389652732759714, + "learning_rate": 8.346666666666666e-07, + "loss": 0.0006, + "num_tokens": 2208411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 138.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04267065227031708, + "kl": 0.0036842571571469307, + "learning_rate": 8.343333333333333e-07, + "loss": 0.0002, + "num_tokens": 2208723.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 138.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023598650470376015, + "kl": 0.09938794374465942, + "learning_rate": 8.340000000000001e-07, + "loss": 0.005, + "num_tokens": 2209095.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 138.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014311257749795914, + "kl": 0.0009229793504346162, + "learning_rate": 8.336666666666668e-07, + "loss": 0.0, + "num_tokens": 2209330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 138.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011054381728172302, + "kl": 0.00014528334213537164, + "learning_rate": 8.333333333333334e-07, + "loss": 0.0, + "num_tokens": 2209586.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 138.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014404000714421272, + "kl": 0.002331082767341286, + "learning_rate": 8.33e-07, + "loss": 0.0001, + "num_tokens": 2209918.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 138.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030474516097456217, + "kl": 3.0584633350372314e-05, + "learning_rate": 8.326666666666667e-07, + "loss": 0.0, + "num_tokens": 2210130.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7503 + }, + { + "clip_ratio/high_max": 0.006756756920367479, + "clip_ratio/high_mean": 0.006756756920367479, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006756756920367479, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 138.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5553717613220215, + "kl": 0.061494626104831696, + "learning_rate": 8.323333333333334e-07, + "loss": -0.1179, + "num_tokens": 2210488.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 7504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007042253389954567, + "clip_ratio/low_min": 0.007042253389954567, + "clip_ratio/region_mean": 0.007042253389954567, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 138.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1578540802001953, + "kl": 0.11879325658082962, + "learning_rate": 8.319999999999999e-07, + "loss": -0.0319, + "num_tokens": 2210829.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 139.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015871938318014145, + "kl": 0.0006293095648288727, + "learning_rate": 8.316666666666666e-07, + "loss": 0.0, + "num_tokens": 2211089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 139.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012658831663429737, + "kl": 0.0005571305810008198, + "learning_rate": 8.313333333333333e-07, + "loss": 0.0, + "num_tokens": 2211406.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06424162536859512, + "kl": 0.00840391730889678, + "learning_rate": 8.310000000000001e-07, + "loss": 0.0004, + "num_tokens": 2211679.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 139.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026806170120835304, + "kl": 0.0011744549265131354, + "learning_rate": 8.306666666666667e-07, + "loss": 0.0001, + "num_tokens": 2211895.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 139.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1453925520181656, + "kl": 0.03526879474520683, + "learning_rate": 8.303333333333334e-07, + "loss": 0.0014, + "num_tokens": 2212212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057810500264167786, + "kl": 0.0025528251426294446, + "learning_rate": 8.3e-07, + "loss": 0.0001, + "num_tokens": 2212478.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 139.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016564732417464256, + "kl": 0.1574014574289322, + "learning_rate": 8.296666666666667e-07, + "loss": 0.0079, + "num_tokens": 2212789.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 139.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0217976626008749, + "kl": 0.008066430920735002, + "learning_rate": 8.293333333333333e-07, + "loss": 0.0004, + "num_tokens": 2213079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 139.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040881071239709854, + "kl": 0.011039676610380411, + "learning_rate": 8.29e-07, + "loss": 0.0006, + "num_tokens": 2213422.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11250057816505432, + "kl": 0.019668866880238056, + "learning_rate": 8.286666666666666e-07, + "loss": 0.001, + "num_tokens": 2213700.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 139.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021708892658352852, + "kl": 0.00024116988060995936, + "learning_rate": 8.283333333333333e-07, + "loss": 0.0, + "num_tokens": 2213957.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007992224767804146, + "kl": 0.0003065453201998025, + "learning_rate": 8.280000000000001e-07, + "loss": 0.0, + "num_tokens": 2214225.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 139.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0388016402721405, + "kl": 0.0016274884110316634, + "learning_rate": 8.276666666666667e-07, + "loss": 0.0001, + "num_tokens": 2214495.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 139.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06862416118383408, + "kl": 0.014306346885859966, + "learning_rate": 8.273333333333334e-07, + "loss": 0.0007, + "num_tokens": 2214824.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 139.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8594709634780884, + "kl": 0.11305754183558747, + "learning_rate": 8.270000000000001e-07, + "loss": 0.0059, + "num_tokens": 2215085.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 139.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.897463858127594, + "kl": 0.07248111662920564, + "learning_rate": 8.266666666666667e-07, + "loss": 0.0042, + "num_tokens": 2215381.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 139.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07356895506381989, + "kl": 0.014403363689780235, + "learning_rate": 8.263333333333333e-07, + "loss": 0.0007, + "num_tokens": 2215663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 139.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010495007038116455, + "kl": 0.002083301544189453, + "learning_rate": 8.26e-07, + "loss": 0.0001, + "num_tokens": 2215879.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 139.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00595331285148859, + "kl": 0.0004487338737817481, + "learning_rate": 8.256666666666666e-07, + "loss": 0.0, + "num_tokens": 2216198.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021645324304699898, + "kl": 0.005602358374744654, + "learning_rate": 8.253333333333333e-07, + "loss": 0.0003, + "num_tokens": 2216466.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 139.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017823046073317528, + "kl": 3.83034348487854e-05, + "learning_rate": 8.25e-07, + "loss": 0.0, + "num_tokens": 2216678.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 139.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.866249084472656, + "kl": 0.054738983511924744, + "learning_rate": 8.246666666666667e-07, + "loss": 0.1679, + "num_tokens": 2216983.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 7527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 139.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.607938528060913, + "kl": 0.10753445327281952, + "learning_rate": 8.243333333333334e-07, + "loss": 0.0064, + "num_tokens": 2217335.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 139.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07217083871364594, + "kl": 0.0256509892642498, + "learning_rate": 8.240000000000001e-07, + "loss": 0.0014, + "num_tokens": 2217646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 139.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035971191246062517, + "kl": 9.822547508520074e-05, + "learning_rate": 8.236666666666666e-07, + "loss": 0.0, + "num_tokens": 2217906.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 139.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06426176428794861, + "kl": 0.012222326826304197, + "learning_rate": 8.233333333333333e-07, + "loss": 0.0006, + "num_tokens": 2218210.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 139.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.294781446456909, + "kl": 0.26684870198369026, + "learning_rate": 8.23e-07, + "loss": 0.0144, + "num_tokens": 2218586.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09070924669504166, + "kl": 0.004909512703306973, + "learning_rate": 8.226666666666667e-07, + "loss": 0.0002, + "num_tokens": 2218882.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 139.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04229502007365227, + "kl": 0.037462251260876656, + "learning_rate": 8.223333333333335e-07, + "loss": 0.0019, + "num_tokens": 2219287.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.072139263153076, + "kl": 0.48522141203284264, + "learning_rate": 8.220000000000001e-07, + "loss": -0.045, + "num_tokens": 2219583.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04239751026034355, + "kl": 0.009944356512278318, + "learning_rate": 8.216666666666667e-07, + "loss": 0.0005, + "num_tokens": 2219867.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 139.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.610398769378662, + "kl": 0.02635159925557673, + "learning_rate": 8.213333333333334e-07, + "loss": 0.0345, + "num_tokens": 2220198.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 139.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036886066198349, + "kl": 0.014997601509094238, + "learning_rate": 8.210000000000001e-07, + "loss": 0.0007, + "num_tokens": 2220494.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016432078555226326, + "kl": 0.0017473031766712666, + "learning_rate": 8.206666666666666e-07, + "loss": 0.0001, + "num_tokens": 2220771.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 139.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044504694640636444, + "kl": 0.006012620055116713, + "learning_rate": 8.203333333333333e-07, + "loss": 0.0003, + "num_tokens": 2221029.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.064578056335449, + "kl": 0.10834954772144556, + "learning_rate": 8.2e-07, + "loss": 0.1237, + "num_tokens": 2221314.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 7541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 139.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03416510671377182, + "kl": 0.000925898551940918, + "learning_rate": 8.196666666666667e-07, + "loss": 0.0, + "num_tokens": 2221524.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 139.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03329979628324509, + "kl": 0.0008929375326260924, + "learning_rate": 8.193333333333334e-07, + "loss": 0.0, + "num_tokens": 2221800.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 139.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03190172091126442, + "kl": 0.007421887246891856, + "learning_rate": 8.190000000000001e-07, + "loss": 0.0003, + "num_tokens": 2222092.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 139.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016833335161209106, + "kl": 0.00046781414130236953, + "learning_rate": 8.186666666666667e-07, + "loss": 0.0, + "num_tokens": 2222325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 139.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0069707660004496574, + "kl": 0.0012512215180322528, + "learning_rate": 8.183333333333334e-07, + "loss": 0.0001, + "num_tokens": 2222607.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 139.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9212307929992676, + "kl": 0.09565270692110062, + "learning_rate": 8.18e-07, + "loss": 0.0923, + "num_tokens": 2222991.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 139.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004899625200778246, + "kl": 0.000315505254548043, + "learning_rate": 8.176666666666667e-07, + "loss": 0.0, + "num_tokens": 2223211.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 139.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034213343169540167, + "kl": 0.00029207393527030945, + "learning_rate": 8.173333333333333e-07, + "loss": 0.0, + "num_tokens": 2223471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 139.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0565950945019722, + "kl": 0.04421113058924675, + "learning_rate": 8.17e-07, + "loss": 0.0022, + "num_tokens": 2223799.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 139.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007374148699454963, + "kl": 0.0037840083241462708, + "learning_rate": 8.166666666666666e-07, + "loss": 0.0002, + "num_tokens": 2224035.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 139.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.976224899291992, + "kl": 1.206072598695755, + "learning_rate": 8.163333333333334e-07, + "loss": 0.0572, + "num_tokens": 2224338.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 139.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043496474623680115, + "kl": 0.10060855746269226, + "learning_rate": 8.160000000000001e-07, + "loss": 0.005, + "num_tokens": 2224710.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 139.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12133745849132538, + "kl": 0.013637838419526815, + "learning_rate": 8.156666666666667e-07, + "loss": 0.0007, + "num_tokens": 2225014.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 139.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34241989254951477, + "kl": 0.0259231049567461, + "learning_rate": 8.153333333333334e-07, + "loss": 0.0015, + "num_tokens": 2225261.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 139.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02494633011519909, + "kl": 0.010982351377606392, + "learning_rate": 8.15e-07, + "loss": 0.0005, + "num_tokens": 2225521.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 139.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6630780696868896, + "kl": 0.19948132801800966, + "learning_rate": 8.146666666666667e-07, + "loss": 0.0239, + "num_tokens": 2225834.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 139.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004337610735092312, + "kl": 1.1578202247619629e-05, + "learning_rate": 8.143333333333333e-07, + "loss": 0.0, + "num_tokens": 2226054.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 139.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03504602238535881, + "kl": 0.005426311166957021, + "learning_rate": 8.14e-07, + "loss": 0.0003, + "num_tokens": 2226386.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 140.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04597458243370056, + "kl": 0.04685534071177244, + "learning_rate": 8.136666666666666e-07, + "loss": 0.0023, + "num_tokens": 2226756.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 140.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04793095588684082, + "kl": 0.00395410624332726, + "learning_rate": 8.133333333333334e-07, + "loss": 0.0002, + "num_tokens": 2227060.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 140.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02366197481751442, + "kl": 0.09934138134121895, + "learning_rate": 8.130000000000001e-07, + "loss": 0.005, + "num_tokens": 2227432.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 140.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07069484144449234, + "kl": 0.04000316001474857, + "learning_rate": 8.126666666666668e-07, + "loss": 0.002, + "num_tokens": 2227771.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 140.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030317479744553566, + "kl": 0.001375596970319748, + "learning_rate": 8.123333333333333e-07, + "loss": 0.0001, + "num_tokens": 2228039.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 140.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04880096763372421, + "kl": 0.012494937982410192, + "learning_rate": 8.12e-07, + "loss": 0.0006, + "num_tokens": 2228300.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 140.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.978727340698242, + "kl": 0.18742307275533676, + "learning_rate": 8.116666666666667e-07, + "loss": -0.0235, + "num_tokens": 2228608.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 140.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006790049374103546, + "kl": 0.00026611237262841314, + "learning_rate": 8.113333333333333e-07, + "loss": 0.0, + "num_tokens": 2228880.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 140.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7163357734680176, + "kl": 0.18649326637387276, + "learning_rate": 8.109999999999999e-07, + "loss": 0.0559, + "num_tokens": 2229221.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 7568 + }, + { + "clip_ratio/high_max": 0.010416666977107525, + "clip_ratio/high_mean": 0.010416666977107525, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010416666977107525, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 140.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.810575008392334, + "kl": 0.07260394468903542, + "learning_rate": 8.106666666666666e-07, + "loss": -0.1203, + "num_tokens": 2229546.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 140.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19880366325378418, + "kl": 0.028160014539025724, + "learning_rate": 8.103333333333334e-07, + "loss": 0.002, + "num_tokens": 2229843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 140.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24680061638355255, + "kl": 0.07979537546634674, + "learning_rate": 8.100000000000001e-07, + "loss": 0.0037, + "num_tokens": 2230159.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 140.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1867517232894897, + "kl": 0.11407772451639175, + "learning_rate": 8.096666666666668e-07, + "loss": 0.006, + "num_tokens": 2230420.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 140.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029743794351816177, + "kl": 0.0008981935679912567, + "learning_rate": 8.093333333333333e-07, + "loss": 0.0, + "num_tokens": 2230664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 140.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03845464065670967, + "kl": 0.0340463537722826, + "learning_rate": 8.09e-07, + "loss": 0.0018, + "num_tokens": 2231034.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 140.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029919151216745377, + "kl": 0.003957441025704611, + "learning_rate": 8.086666666666667e-07, + "loss": 0.0002, + "num_tokens": 2231292.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 140.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4280803203582764, + "kl": 0.47196139581501484, + "learning_rate": 8.083333333333334e-07, + "loss": 0.0234, + "num_tokens": 2231614.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 140.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046749360859394073, + "kl": 0.006110889138653874, + "learning_rate": 8.079999999999999e-07, + "loss": 0.0003, + "num_tokens": 2231945.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 140.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08680707216262817, + "kl": 0.007161391666159034, + "learning_rate": 8.076666666666666e-07, + "loss": 0.0004, + "num_tokens": 2232277.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 140.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4702816009521484, + "kl": 0.5615438558161259, + "learning_rate": 8.073333333333334e-07, + "loss": -0.0502, + "num_tokens": 2232636.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 140.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007369700470007956, + "kl": 0.0037825629115104675, + "learning_rate": 8.070000000000001e-07, + "loss": 0.0002, + "num_tokens": 2232872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 140.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1824977695941925, + "kl": 0.027511563152074814, + "learning_rate": 8.066666666666667e-07, + "loss": 0.0014, + "num_tokens": 2233158.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 140.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03374043479561806, + "kl": 0.04851582646369934, + "learning_rate": 8.063333333333333e-07, + "loss": 0.0024, + "num_tokens": 2233562.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 140.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005906059872359037, + "kl": 0.0006824582815170288, + "learning_rate": 8.06e-07, + "loss": 0.0, + "num_tokens": 2233778.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 140.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008622748428024352, + "kl": 0.0012761758989654481, + "learning_rate": 8.056666666666667e-07, + "loss": 0.0001, + "num_tokens": 2234058.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 140.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029156355187296867, + "kl": 0.0010121912637259811, + "learning_rate": 8.053333333333333e-07, + "loss": 0.0001, + "num_tokens": 2234388.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 140.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011493833735585213, + "kl": 0.002943081548437476, + "learning_rate": 8.049999999999999e-07, + "loss": 0.0001, + "num_tokens": 2234679.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 76.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 140.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.269927740097046, + "kl": 0.2345767617225647, + "learning_rate": 8.046666666666666e-07, + "loss": 0.453, + "num_tokens": 2235223.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 140.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0193402767181396, + "kl": 0.05985639221034944, + "learning_rate": 8.043333333333334e-07, + "loss": 0.0034, + "num_tokens": 2235499.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 140.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337231695652008, + "kl": 0.02417577523738146, + "learning_rate": 8.04e-07, + "loss": 0.0014, + "num_tokens": 2235781.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 140.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053484175354242325, + "kl": 0.028386572375893593, + "learning_rate": 8.036666666666667e-07, + "loss": 0.0013, + "num_tokens": 2236083.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 140.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021800415590405464, + "kl": 0.0011058914533350617, + "learning_rate": 8.033333333333334e-07, + "loss": 0.0001, + "num_tokens": 2236359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 140.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07727114111185074, + "kl": 0.004364959895610809, + "learning_rate": 8.03e-07, + "loss": 0.0002, + "num_tokens": 2236570.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 140.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18696685135364532, + "kl": 0.020725714042782784, + "learning_rate": 8.026666666666667e-07, + "loss": 0.001, + "num_tokens": 2236852.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 140.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028688820078969002, + "kl": 0.0038562872214242816, + "learning_rate": 8.023333333333333e-07, + "loss": 0.0002, + "num_tokens": 2237140.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 140.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17888464033603668, + "kl": 0.008969725575298071, + "learning_rate": 8.02e-07, + "loss": 0.0003, + "num_tokens": 2237388.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 140.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1041543036699295, + "kl": 0.0071145216934382915, + "learning_rate": 8.016666666666668e-07, + "loss": 0.0005, + "num_tokens": 2237615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 140.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15359792113304138, + "kl": 0.009021283593028784, + "learning_rate": 8.013333333333335e-07, + "loss": 0.0005, + "num_tokens": 2237873.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 140.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030205674469470978, + "kl": 0.0047327810898423195, + "learning_rate": 8.01e-07, + "loss": 0.0002, + "num_tokens": 2238161.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 140.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029939625412225723, + "kl": 0.00027085840702056885, + "learning_rate": 8.006666666666667e-07, + "loss": 0.0, + "num_tokens": 2238373.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 140.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6898300647735596, + "kl": 0.3640999048948288, + "learning_rate": 8.003333333333334e-07, + "loss": 0.0106, + "num_tokens": 2238746.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 140.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04006919637322426, + "kl": 0.008541662245988846, + "learning_rate": 8e-07, + "loss": 0.0004, + "num_tokens": 2239105.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 140.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01660209149122238, + "kl": 0.00018564164929557592, + "learning_rate": 7.996666666666666e-07, + "loss": 0.0, + "num_tokens": 2239361.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 140.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30509015917778015, + "kl": 0.04901084862649441, + "learning_rate": 7.993333333333333e-07, + "loss": 0.0023, + "num_tokens": 2239655.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 140.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04473618417978287, + "kl": 0.0024844787549227476, + "learning_rate": 7.99e-07, + "loss": 0.0001, + "num_tokens": 2239951.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 140.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00043820179416798055, + "kl": 1.1593103408813477e-05, + "learning_rate": 7.986666666666668e-07, + "loss": 0.0, + "num_tokens": 2240171.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 140.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17294429242610931, + "kl": 0.033557578921318054, + "learning_rate": 7.983333333333335e-07, + "loss": 0.0018, + "num_tokens": 2240482.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 140.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014695012010633945, + "kl": 0.0003977760788984597, + "learning_rate": 7.98e-07, + "loss": 0.0, + "num_tokens": 2240798.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 140.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15435448288917542, + "kl": 0.025543692521750927, + "learning_rate": 7.976666666666667e-07, + "loss": 0.0013, + "num_tokens": 2241069.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 140.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004698130302131176, + "kl": 0.00029430389986373484, + "learning_rate": 7.973333333333334e-07, + "loss": 0.0, + "num_tokens": 2241289.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 140.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06369496881961823, + "kl": 0.008858399465680122, + "learning_rate": 7.970000000000001e-07, + "loss": 0.0004, + "num_tokens": 2241601.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 140.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011792807839810848, + "kl": 0.0022514045267598704, + "learning_rate": 7.966666666666666e-07, + "loss": 0.0001, + "num_tokens": 2241867.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 140.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02483171783387661, + "kl": 0.006977959303185344, + "learning_rate": 7.963333333333333e-07, + "loss": 0.0003, + "num_tokens": 2242156.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 140.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.277273178100586, + "kl": 0.03083835239522159, + "learning_rate": 7.96e-07, + "loss": 0.0367, + "num_tokens": 2242390.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 7613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04749931022524834, + "kl": 0.02056124061346054, + "learning_rate": 7.956666666666668e-07, + "loss": 0.0011, + "num_tokens": 2242670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 141.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018423404544591904, + "kl": 0.0005905196012463421, + "learning_rate": 7.953333333333334e-07, + "loss": 0.0, + "num_tokens": 2242985.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 92.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 92.75, + "completions/mean_terminated_length": 38.333335876464844, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 141.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6382052898406982, + "kl": 0.1818025279790163, + "learning_rate": 7.95e-07, + "loss": 0.3945, + "num_tokens": 2243572.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 7616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 141.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09395188838243484, + "kl": 0.0018283475656062365, + "learning_rate": 7.946666666666667e-07, + "loss": 0.0001, + "num_tokens": 2243792.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 141.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01620199717581272, + "kl": 0.0007060617208480835, + "learning_rate": 7.943333333333334e-07, + "loss": 0.0, + "num_tokens": 2244052.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 141.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03634077310562134, + "kl": 0.0008541723364032805, + "learning_rate": 7.94e-07, + "loss": 0.0, + "num_tokens": 2244293.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 141.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02382316254079342, + "kl": 0.011331932619214058, + "learning_rate": 7.936666666666666e-07, + "loss": 0.0006, + "num_tokens": 2244553.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 141.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02722771093249321, + "kl": 0.00026547908782958984, + "learning_rate": 7.933333333333333e-07, + "loss": 0.0, + "num_tokens": 2244809.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 141.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008668236434459686, + "kl": 0.0005737521569244564, + "learning_rate": 7.93e-07, + "loss": 0.0, + "num_tokens": 2245044.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 141.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023655198514461517, + "kl": 0.0043696698267012835, + "learning_rate": 7.926666666666668e-07, + "loss": 0.0002, + "num_tokens": 2245363.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 141.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04898698255419731, + "kl": 0.007389507722109556, + "learning_rate": 7.923333333333334e-07, + "loss": 0.0004, + "num_tokens": 2245647.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 141.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18285970389842987, + "kl": 0.1745816171169281, + "learning_rate": 7.920000000000001e-07, + "loss": 0.0088, + "num_tokens": 2245962.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 141.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1095123142004013, + "kl": 0.021343120373785496, + "learning_rate": 7.916666666666667e-07, + "loss": 0.0012, + "num_tokens": 2246246.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7966949939727783, + "kl": 0.03962932527065277, + "learning_rate": 7.913333333333334e-07, + "loss": 0.0948, + "num_tokens": 2246523.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 141.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36198094487190247, + "kl": 0.06072469800710678, + "learning_rate": 7.91e-07, + "loss": 0.0028, + "num_tokens": 2246822.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3703181743621826, + "kl": 0.210733522195369, + "learning_rate": 7.906666666666666e-07, + "loss": -0.0461, + "num_tokens": 2247107.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 141.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.765426158905029, + "kl": 0.03463748004287481, + "learning_rate": 7.903333333333333e-07, + "loss": 0.0142, + "num_tokens": 2247412.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 141.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004028107039630413, + "kl": 0.00026410221471451223, + "learning_rate": 7.9e-07, + "loss": 0.0, + "num_tokens": 2247632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 141.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0638107880949974, + "kl": 0.008552071638405323, + "learning_rate": 7.896666666666667e-07, + "loss": 0.0004, + "num_tokens": 2247944.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 141.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06308624148368835, + "kl": 0.004701980855315924, + "learning_rate": 7.893333333333334e-07, + "loss": 0.0002, + "num_tokens": 2248198.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 141.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03033612109720707, + "kl": 0.0047260463470593095, + "learning_rate": 7.890000000000001e-07, + "loss": 0.0002, + "num_tokens": 2248527.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02245710790157318, + "kl": 0.002423997357254848, + "learning_rate": 7.886666666666667e-07, + "loss": 0.0001, + "num_tokens": 2248797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 141.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03912690281867981, + "kl": 0.0017813832382671535, + "learning_rate": 7.883333333333333e-07, + "loss": 0.0001, + "num_tokens": 2249067.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 141.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026895679533481598, + "kl": 0.0020042358519276604, + "learning_rate": 7.88e-07, + "loss": 0.0001, + "num_tokens": 2249357.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 141.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07445089519023895, + "kl": 0.011246025562286377, + "learning_rate": 7.876666666666667e-07, + "loss": 0.0006, + "num_tokens": 2249685.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 141.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17501963675022125, + "kl": 0.025519472546875477, + "learning_rate": 7.873333333333333e-07, + "loss": 0.0014, + "num_tokens": 2250027.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 141.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008800389245152473, + "kl": 0.09812876582145691, + "learning_rate": 7.869999999999999e-07, + "loss": 0.0049, + "num_tokens": 2250399.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 141.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09145799279212952, + "kl": 0.0044666125904768705, + "learning_rate": 7.866666666666667e-07, + "loss": 0.0002, + "num_tokens": 2250653.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026417864486575127, + "kl": 0.0023253896506503224, + "learning_rate": 7.863333333333334e-07, + "loss": 0.0001, + "num_tokens": 2250930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 141.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20951484143733978, + "kl": 0.08382711559534073, + "learning_rate": 7.860000000000001e-07, + "loss": 0.0042, + "num_tokens": 2251333.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009331641718745232, + "kl": 0.00131767155835405, + "learning_rate": 7.856666666666666e-07, + "loss": 0.0001, + "num_tokens": 2251615.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 141.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11620497703552246, + "kl": 0.029218826442956924, + "learning_rate": 7.853333333333333e-07, + "loss": 0.0015, + "num_tokens": 2251926.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 141.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1253441423177719, + "kl": 0.013156628585420549, + "learning_rate": 7.85e-07, + "loss": 0.0007, + "num_tokens": 2252256.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 141.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1754655838012695, + "kl": 0.05695727467536926, + "learning_rate": 7.846666666666667e-07, + "loss": 0.0295, + "num_tokens": 2252554.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 141.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01952867954969406, + "kl": 0.0005751699209213257, + "learning_rate": 7.843333333333332e-07, + "loss": 0.0, + "num_tokens": 2252760.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051282916218042374, + "kl": 0.004241932416334748, + "learning_rate": 7.839999999999999e-07, + "loss": 0.0002, + "num_tokens": 2253033.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 141.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00404238561168313, + "kl": 0.00012672245065914467, + "learning_rate": 7.836666666666667e-07, + "loss": 0.0, + "num_tokens": 2253293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 141.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.683503150939941, + "kl": 0.09627540200017393, + "learning_rate": 7.833333333333334e-07, + "loss": 0.1975, + "num_tokens": 2253621.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 141.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10470376163721085, + "kl": 0.017083127051591873, + "learning_rate": 7.830000000000001e-07, + "loss": 0.0009, + "num_tokens": 2253901.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 141.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12036968767642975, + "kl": 0.003055006265640259, + "learning_rate": 7.826666666666667e-07, + "loss": 0.0001, + "num_tokens": 2254114.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 141.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.06951379776001, + "kl": 0.04319071210920811, + "learning_rate": 7.823333333333333e-07, + "loss": 0.0417, + "num_tokens": 2254390.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 141.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09180354326963425, + "kl": 0.016905405558645725, + "learning_rate": 7.82e-07, + "loss": 0.0009, + "num_tokens": 2254719.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 141.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03564859554171562, + "kl": 0.0022495443117804825, + "learning_rate": 7.816666666666667e-07, + "loss": 0.0001, + "num_tokens": 2254991.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 141.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12955109775066376, + "kl": 0.04939829558134079, + "learning_rate": 7.813333333333332e-07, + "loss": 0.0025, + "num_tokens": 2255330.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 141.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3093033730983734, + "kl": 0.026199544459814206, + "learning_rate": 7.810000000000001e-07, + "loss": 0.0016, + "num_tokens": 2255599.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 141.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05505216121673584, + "kl": 0.004061909057782032, + "learning_rate": 7.806666666666668e-07, + "loss": 0.0002, + "num_tokens": 2255897.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 141.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1282864362001419, + "kl": 0.058737581595778465, + "learning_rate": 7.803333333333334e-07, + "loss": 0.003, + "num_tokens": 2256270.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 141.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007894549635238945, + "kl": 0.0037700235843658447, + "learning_rate": 7.8e-07, + "loss": 0.0002, + "num_tokens": 2256506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 141.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7861123085021973, + "kl": 0.2590033560991287, + "learning_rate": 7.796666666666667e-07, + "loss": 0.1184, + "num_tokens": 2256867.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 141.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3026018142700195, + "kl": 0.20893632620573044, + "learning_rate": 7.793333333333333e-07, + "loss": 0.1137, + "num_tokens": 2257244.0, + "reward": 6.75, + "reward_std": 2.1794495582580566, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.1794495582580566, + "step": 7663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 141.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02406764030456543, + "kl": 0.2635874003171921, + "learning_rate": 7.79e-07, + "loss": 0.0132, + "num_tokens": 2257549.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 141.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16898702085018158, + "kl": 0.01364335953257978, + "learning_rate": 7.786666666666667e-07, + "loss": 0.0006, + "num_tokens": 2257847.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 141.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.303480625152588, + "kl": 0.1552797630429268, + "learning_rate": 7.783333333333333e-07, + "loss": 0.0088, + "num_tokens": 2258101.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 7666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 141.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006510845851153135, + "kl": 2.022087574005127e-05, + "learning_rate": 7.780000000000001e-07, + "loss": 0.0, + "num_tokens": 2258321.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09836862981319427, + "kl": 0.011081829317845404, + "learning_rate": 7.776666666666668e-07, + "loss": 0.0006, + "num_tokens": 2258623.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 142.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3293292224407196, + "kl": 0.03504626452922821, + "learning_rate": 7.773333333333334e-07, + "loss": 0.0018, + "num_tokens": 2258934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 142.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009963270276784897, + "kl": 0.26697002351284027, + "learning_rate": 7.77e-07, + "loss": 0.0133, + "num_tokens": 2259238.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 142.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10451026260852814, + "kl": 0.024246810004115105, + "learning_rate": 7.766666666666667e-07, + "loss": 0.0012, + "num_tokens": 2259532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 142.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19686958193778992, + "kl": 0.015576696721836925, + "learning_rate": 7.763333333333334e-07, + "loss": 0.0007, + "num_tokens": 2259813.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 142.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05493713542819023, + "kl": 0.002116822579409927, + "learning_rate": 7.76e-07, + "loss": 0.0001, + "num_tokens": 2260077.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 142.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19821488857269287, + "kl": 0.04400704731233418, + "learning_rate": 7.756666666666666e-07, + "loss": 0.0014, + "num_tokens": 2260431.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 142.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013925421983003616, + "kl": 0.00039866415318101645, + "learning_rate": 7.753333333333333e-07, + "loss": 0.0, + "num_tokens": 2260747.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 142.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030978256836533546, + "kl": 0.1637900248169899, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0082, + "num_tokens": 2261056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 142.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059655290096998215, + "kl": 0.0076184822246432304, + "learning_rate": 7.746666666666668e-07, + "loss": 0.0004, + "num_tokens": 2261368.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0249209962785244, + "kl": 0.004268008982762694, + "learning_rate": 7.743333333333333e-07, + "loss": 0.0002, + "num_tokens": 2261666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 142.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022615468129515648, + "kl": 0.002672118949703872, + "learning_rate": 7.74e-07, + "loss": 0.0001, + "num_tokens": 2261992.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 142.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005763848894275725, + "kl": 1.6361474990844727e-05, + "learning_rate": 7.736666666666667e-07, + "loss": 0.0, + "num_tokens": 2262212.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 142.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004928158596158028, + "kl": 0.00047995930071920156, + "learning_rate": 7.733333333333334e-07, + "loss": 0.0, + "num_tokens": 2262446.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 142.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021443970501422882, + "kl": 0.0014528706087730825, + "learning_rate": 7.729999999999999e-07, + "loss": 0.0001, + "num_tokens": 2262734.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 142.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003393648425117135, + "kl": 0.00034332985524088144, + "learning_rate": 7.726666666666666e-07, + "loss": 0.0, + "num_tokens": 2263051.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 142.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.89183235168457, + "kl": 0.16192296892404556, + "learning_rate": 7.723333333333333e-07, + "loss": 0.1482, + "num_tokens": 2263270.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 142.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027072906494140625, + "kl": 0.00036369860026752576, + "learning_rate": 7.720000000000001e-07, + "loss": 0.0, + "num_tokens": 2263526.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 142.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03711475804448128, + "kl": 0.000505167234223336, + "learning_rate": 7.716666666666668e-07, + "loss": 0.0, + "num_tokens": 2263739.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 142.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27470842003822327, + "kl": 0.08565065264701843, + "learning_rate": 7.713333333333334e-07, + "loss": 0.0044, + "num_tokens": 2264075.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 142.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03703993186354637, + "kl": 0.03506651986390352, + "learning_rate": 7.71e-07, + "loss": 0.0018, + "num_tokens": 2264444.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 142.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019775282591581345, + "kl": 0.0018073072278639302, + "learning_rate": 7.706666666666667e-07, + "loss": 0.0001, + "num_tokens": 2264714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 142.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04220382869243622, + "kl": 0.011459505651146173, + "learning_rate": 7.703333333333334e-07, + "loss": 0.0006, + "num_tokens": 2265016.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 142.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013074828311800957, + "kl": 0.0005056522786617279, + "learning_rate": 7.699999999999999e-07, + "loss": 0.0, + "num_tokens": 2265276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 142.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2924010455608368, + "kl": 0.038424568716436625, + "learning_rate": 7.696666666666666e-07, + "loss": 0.0019, + "num_tokens": 2265551.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2520202696323395, + "kl": 0.038098571822047234, + "learning_rate": 7.693333333333333e-07, + "loss": 0.0021, + "num_tokens": 2265834.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 142.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06367816776037216, + "kl": 0.03997368738055229, + "learning_rate": 7.690000000000001e-07, + "loss": 0.002, + "num_tokens": 2266171.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 142.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9301414489746094, + "kl": 0.08043321594595909, + "learning_rate": 7.686666666666667e-07, + "loss": 0.0809, + "num_tokens": 2266477.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 142.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000715200207196176, + "kl": 0.0037872716784477234, + "learning_rate": 7.683333333333334e-07, + "loss": 0.0002, + "num_tokens": 2266713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 142.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004985891282558441, + "kl": 0.00019570887525333092, + "learning_rate": 7.68e-07, + "loss": 0.0, + "num_tokens": 2266973.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 142.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12844973802566528, + "kl": 0.015208481345325708, + "learning_rate": 7.676666666666667e-07, + "loss": 0.0009, + "num_tokens": 2267288.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 142.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07045193761587143, + "kl": 0.0034580400679260492, + "learning_rate": 7.673333333333333e-07, + "loss": 0.0002, + "num_tokens": 2267560.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 142.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030599597841501236, + "kl": 0.004920025123283267, + "learning_rate": 7.67e-07, + "loss": 0.0003, + "num_tokens": 2267816.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 142.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07589931786060333, + "kl": 0.017345746979117393, + "learning_rate": 7.666666666666666e-07, + "loss": 0.001, + "num_tokens": 2268089.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 142.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02332138828933239, + "kl": 0.011409349273890257, + "learning_rate": 7.663333333333333e-07, + "loss": 0.0006, + "num_tokens": 2268349.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 44.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 44.75, + "completions/mean_terminated_length": 44.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 142.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4340505599975586, + "kl": 0.17629918456077576, + "learning_rate": 7.660000000000001e-07, + "loss": 0.0885, + "num_tokens": 2268744.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 142.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009740120731294155, + "kl": 0.0014854312175884843, + "learning_rate": 7.656666666666667e-07, + "loss": 0.0001, + "num_tokens": 2269004.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 142.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0739278569817543, + "kl": 0.01706059416756034, + "learning_rate": 7.653333333333334e-07, + "loss": 0.0009, + "num_tokens": 2269328.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 142.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000878873048350215, + "kl": 0.0012810421758331358, + "learning_rate": 7.65e-07, + "loss": 0.0001, + "num_tokens": 2269608.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.551203727722168, + "kl": 0.026009714230895042, + "learning_rate": 7.646666666666667e-07, + "loss": 0.204, + "num_tokens": 2269899.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 7707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 142.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04626341909170151, + "kl": 0.011175478342920542, + "learning_rate": 7.643333333333333e-07, + "loss": 0.0006, + "num_tokens": 2270223.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04338642209768295, + "kl": 0.0029926609713584185, + "learning_rate": 7.64e-07, + "loss": 0.0001, + "num_tokens": 2270502.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 142.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.014252185821533, + "kl": 0.02385175507515669, + "learning_rate": 7.636666666666666e-07, + "loss": 0.1599, + "num_tokens": 2270853.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 142.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02578860893845558, + "kl": 0.0014058202505111694, + "learning_rate": 7.633333333333333e-07, + "loss": 0.0001, + "num_tokens": 2271065.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 142.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029581209644675255, + "kl": 0.0018641411734279245, + "learning_rate": 7.63e-07, + "loss": 0.0001, + "num_tokens": 2271284.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 142.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12762634456157684, + "kl": 0.021816055290400982, + "learning_rate": 7.626666666666667e-07, + "loss": 0.0011, + "num_tokens": 2271570.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 142.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.193521738052368, + "kl": 0.12909285724163055, + "learning_rate": 7.623333333333334e-07, + "loss": -0.1468, + "num_tokens": 2271930.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 142.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.3475847244262695, + "kl": 0.09858269989490509, + "learning_rate": 7.620000000000001e-07, + "loss": 0.0093, + "num_tokens": 2272174.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 142.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0230876374989748, + "kl": 0.0007108896970748901, + "learning_rate": 7.616666666666666e-07, + "loss": 0.0, + "num_tokens": 2272382.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 142.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029099872335791588, + "kl": 0.006785084804505459, + "learning_rate": 7.613333333333333e-07, + "loss": 0.0003, + "num_tokens": 2272654.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 142.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027688313275575638, + "kl": 0.006402334664016962, + "learning_rate": 7.61e-07, + "loss": 0.0003, + "num_tokens": 2272944.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 142.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.777172565460205, + "kl": 0.13309234008193016, + "learning_rate": 7.606666666666666e-07, + "loss": 0.0841, + "num_tokens": 2273326.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 7719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 142.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7277072668075562, + "kl": 0.2800426259636879, + "learning_rate": 7.603333333333335e-07, + "loss": 0.0065, + "num_tokens": 2273729.0, + "reward": 1.75, + "reward_std": 1.443375587463379, + "rewards/reward_combined/mean": 1.75, + "rewards/reward_combined/std": 1.4433757066726685, + "step": 7720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 142.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006836375687271357, + "kl": 0.0016658224340062588, + "learning_rate": 7.600000000000001e-07, + "loss": 0.0001, + "num_tokens": 2274025.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 143.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06354095786809921, + "kl": 0.007469865377061069, + "learning_rate": 7.596666666666667e-07, + "loss": 0.0003, + "num_tokens": 2274347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7722 + }, + { + "clip_ratio/high_max": 0.01785714365541935, + "clip_ratio/high_mean": 0.01785714365541935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01785714365541935, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 143.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.344735145568848, + "kl": 0.9647806752473116, + "learning_rate": 7.593333333333334e-07, + "loss": 0.0463, + "num_tokens": 2274619.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 143.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007394056883640587, + "kl": 2.3886561393737793e-05, + "learning_rate": 7.590000000000001e-07, + "loss": 0.0, + "num_tokens": 2274839.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 143.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03275536373257637, + "kl": 0.005106767290271819, + "learning_rate": 7.586666666666666e-07, + "loss": 0.0003, + "num_tokens": 2275164.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 143.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015481347218155861, + "kl": 0.0004124348779441789, + "learning_rate": 7.583333333333333e-07, + "loss": 0.0, + "num_tokens": 2275407.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 143.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014245212078094482, + "kl": 0.0004931064031552523, + "learning_rate": 7.58e-07, + "loss": 0.0, + "num_tokens": 2275641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 143.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011389863677322865, + "kl": 0.0024091824889183044, + "learning_rate": 7.576666666666667e-07, + "loss": 0.0001, + "num_tokens": 2275857.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7728 + }, + { + "clip_ratio/high_max": 0.006493506487458944, + "clip_ratio/high_mean": 0.006493506487458944, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006493506487458944, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 143.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3838181495666504, + "kl": 0.1302348356693983, + "learning_rate": 7.573333333333334e-07, + "loss": -0.024, + "num_tokens": 2276231.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 143.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04056413844227791, + "kl": 0.03344356641173363, + "learning_rate": 7.570000000000001e-07, + "loss": 0.0016, + "num_tokens": 2276647.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 143.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026046650484204292, + "kl": 0.002310799201950431, + "learning_rate": 7.566666666666667e-07, + "loss": 0.0001, + "num_tokens": 2276924.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 143.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04400316998362541, + "kl": 0.0038310529198497534, + "learning_rate": 7.563333333333334e-07, + "loss": 0.0002, + "num_tokens": 2277196.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 143.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03560112416744232, + "kl": 0.007601238437928259, + "learning_rate": 7.56e-07, + "loss": 0.0004, + "num_tokens": 2277489.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 143.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08714082092046738, + "kl": 0.015921350568532944, + "learning_rate": 7.556666666666667e-07, + "loss": 0.0009, + "num_tokens": 2277763.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 143.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005249094683676958, + "kl": 0.0011807740083895624, + "learning_rate": 7.553333333333333e-07, + "loss": 0.0001, + "num_tokens": 2278023.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 143.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6178362369537354, + "kl": 0.20231213700026274, + "learning_rate": 7.55e-07, + "loss": 0.0118, + "num_tokens": 2278321.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 143.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005135569255799055, + "kl": 0.0002174973487854004, + "learning_rate": 7.546666666666666e-07, + "loss": 0.0, + "num_tokens": 2278581.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 143.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10283061861991882, + "kl": 0.018414645222947, + "learning_rate": 7.543333333333334e-07, + "loss": 0.001, + "num_tokens": 2278867.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.0, + "completions/max_terminated_length": 56.0, + "completions/mean_length": 42.5, + "completions/mean_terminated_length": 42.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 143.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0893920361995697, + "kl": 0.04302060045301914, + "learning_rate": 7.540000000000001e-07, + "loss": 0.0023, + "num_tokens": 2279261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 143.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023387538269162178, + "kl": 0.0018110059027094394, + "learning_rate": 7.536666666666667e-07, + "loss": 0.0001, + "num_tokens": 2279557.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 143.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02173694223165512, + "kl": 0.0010395422723377123, + "learning_rate": 7.533333333333334e-07, + "loss": 0.0001, + "num_tokens": 2279831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7741 + }, + { + "clip_ratio/high_max": 0.008928571827709675, + "clip_ratio/high_mean": 0.008928571827709675, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008928571827709675, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 143.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6484451293945312, + "kl": 0.09152450412511826, + "learning_rate": 7.53e-07, + "loss": 0.0785, + "num_tokens": 2280142.0, + "reward": 7.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.375, + "rewards/reward_combined/std": 0.25, + "step": 7742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 143.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9953925609588623, + "kl": 0.006105820881202817, + "learning_rate": 7.526666666666667e-07, + "loss": 0.092, + "num_tokens": 2280485.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 143.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15873970091342926, + "kl": 0.026338991709053516, + "learning_rate": 7.523333333333333e-07, + "loss": 0.0014, + "num_tokens": 2280812.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 143.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012160657905042171, + "kl": 0.007804167224094272, + "learning_rate": 7.52e-07, + "loss": 0.0004, + "num_tokens": 2281084.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 143.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15064144134521484, + "kl": 0.0025098994374275208, + "learning_rate": 7.516666666666666e-07, + "loss": 0.0002, + "num_tokens": 2281294.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 143.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01748109981417656, + "kl": 0.0004294489699532278, + "learning_rate": 7.513333333333334e-07, + "loss": 0.0, + "num_tokens": 2281564.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 143.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024861948564648628, + "kl": 0.006381360813975334, + "learning_rate": 7.510000000000001e-07, + "loss": 0.0003, + "num_tokens": 2281854.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 143.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02653603069484234, + "kl": 0.002868482959456742, + "learning_rate": 7.506666666666668e-07, + "loss": 0.0001, + "num_tokens": 2282125.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 143.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026265760883688927, + "kl": 0.001516878604888916, + "learning_rate": 7.503333333333333e-07, + "loss": 0.0001, + "num_tokens": 2282337.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 143.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02322395145893097, + "kl": 0.011362049262970686, + "learning_rate": 7.5e-07, + "loss": 0.0006, + "num_tokens": 2282597.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 143.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02803313359618187, + "kl": 0.0050208475440740585, + "learning_rate": 7.496666666666667e-07, + "loss": 0.0003, + "num_tokens": 2282901.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 143.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.002462148666382, + "kl": 0.042132167145609856, + "learning_rate": 7.493333333333333e-07, + "loss": 0.0558, + "num_tokens": 2283184.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 143.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28658008575439453, + "kl": 0.08020685985684395, + "learning_rate": 7.49e-07, + "loss": 0.0039, + "num_tokens": 2283529.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 143.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8932595252990723, + "kl": 0.023721948266029358, + "learning_rate": 7.486666666666667e-07, + "loss": 0.0008, + "num_tokens": 2283855.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 143.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014380471780896187, + "kl": 0.15950369089841843, + "learning_rate": 7.483333333333333e-07, + "loss": 0.008, + "num_tokens": 2284165.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 143.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19520796835422516, + "kl": 0.04966457188129425, + "learning_rate": 7.48e-07, + "loss": 0.0025, + "num_tokens": 2284507.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 143.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037564807571470737, + "kl": 0.0002024372515734285, + "learning_rate": 7.476666666666668e-07, + "loss": 0.0, + "num_tokens": 2284819.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 143.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17725761234760284, + "kl": 0.0379670774564147, + "learning_rate": 7.473333333333333e-07, + "loss": 0.0017, + "num_tokens": 2285164.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 143.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057189084589481354, + "kl": 0.009764651767909527, + "learning_rate": 7.47e-07, + "loss": 0.0005, + "num_tokens": 2285476.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 143.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5905125737190247, + "kl": 0.04659403022378683, + "learning_rate": 7.466666666666667e-07, + "loss": 0.0023, + "num_tokens": 2285808.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 143.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032066769897937775, + "kl": 0.0002530887722969055, + "learning_rate": 7.463333333333334e-07, + "loss": 0.0, + "num_tokens": 2286021.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 143.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012608855031430721, + "kl": 0.0001500278667663224, + "learning_rate": 7.46e-07, + "loss": 0.0, + "num_tokens": 2286277.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.009803921915590763, + "clip_ratio/low_min": 0.009803921915590763, + "clip_ratio/region_mean": 0.009803921915590763, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 143.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.283246994018555, + "kl": 0.03131023049354553, + "learning_rate": 7.456666666666667e-07, + "loss": 0.0411, + "num_tokens": 2286602.0, + "reward": 2.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 2.5, + "step": 7764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 143.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028160201385617256, + "kl": 0.0009720735251903534, + "learning_rate": 7.453333333333334e-07, + "loss": 0.0, + "num_tokens": 2286862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 143.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048435479402542114, + "kl": 0.040613481774926186, + "learning_rate": 7.45e-07, + "loss": 0.002, + "num_tokens": 2287172.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 143.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008940286934375763, + "kl": 0.09814896434545517, + "learning_rate": 7.446666666666667e-07, + "loss": 0.0049, + "num_tokens": 2287544.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 143.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004764673765748739, + "kl": 0.0009603133366908878, + "learning_rate": 7.443333333333333e-07, + "loss": 0.0, + "num_tokens": 2287828.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 143.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014307666569948196, + "kl": 0.0007961370865814388, + "learning_rate": 7.44e-07, + "loss": 0.0, + "num_tokens": 2288117.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 143.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004489877261221409, + "kl": 0.0002865791320800781, + "learning_rate": 7.436666666666667e-07, + "loss": 0.0, + "num_tokens": 2288337.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 143.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.451237201690674, + "kl": 0.778635174036026, + "learning_rate": 7.433333333333333e-07, + "loss": 0.0713, + "num_tokens": 2288643.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 143.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007532176678068936, + "kl": 0.003782317042350769, + "learning_rate": 7.43e-07, + "loss": 0.0002, + "num_tokens": 2288879.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 143.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05567490682005882, + "kl": 0.0043501444888534024, + "learning_rate": 7.426666666666667e-07, + "loss": 0.0002, + "num_tokens": 2289198.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 143.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4712536334991455, + "kl": 0.11329841800034046, + "learning_rate": 7.423333333333334e-07, + "loss": 0.0058, + "num_tokens": 2289532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 143.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224059522151947, + "kl": 0.0027415878139436245, + "learning_rate": 7.42e-07, + "loss": 0.0001, + "num_tokens": 2289797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 144.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04243478178977966, + "kl": 0.01574730360880494, + "learning_rate": 7.416666666666667e-07, + "loss": 0.0008, + "num_tokens": 2290096.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 144.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020603708922863007, + "kl": 0.0033009694889187813, + "learning_rate": 7.413333333333334e-07, + "loss": 0.0002, + "num_tokens": 2290387.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 144.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007747675408609211, + "kl": 0.003777734935283661, + "learning_rate": 7.41e-07, + "loss": 0.0002, + "num_tokens": 2290623.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 144.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004194805398583412, + "kl": 0.0015745406853966415, + "learning_rate": 7.406666666666667e-07, + "loss": 0.0001, + "num_tokens": 2290919.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 144.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09947057068347931, + "kl": 0.004641398787498474, + "learning_rate": 7.403333333333334e-07, + "loss": 0.0002, + "num_tokens": 2291139.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 144.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009100621566176414, + "kl": 0.0005209706723690033, + "learning_rate": 7.4e-07, + "loss": 0.0, + "num_tokens": 2291399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 144.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07040928304195404, + "kl": 0.011888418346643448, + "learning_rate": 7.396666666666667e-07, + "loss": 0.0006, + "num_tokens": 2291713.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 144.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9421210289001465, + "kl": 0.24796737730503082, + "learning_rate": 7.393333333333334e-07, + "loss": 0.0007, + "num_tokens": 2292084.0, + "reward": 2.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 2.25, + "step": 7783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 144.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027161186560988426, + "kl": 0.0025201529497280717, + "learning_rate": 7.389999999999999e-07, + "loss": 0.0001, + "num_tokens": 2292344.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 144.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.503671169281006, + "kl": 0.6986650750041008, + "learning_rate": 7.386666666666667e-07, + "loss": 0.0457, + "num_tokens": 2292658.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 144.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025138195604085922, + "kl": 0.0013024210929870605, + "learning_rate": 7.383333333333334e-07, + "loss": 0.0001, + "num_tokens": 2292870.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 144.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03400333598256111, + "kl": 0.009914604714140296, + "learning_rate": 7.38e-07, + "loss": 0.0005, + "num_tokens": 2293156.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 144.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15238557755947113, + "kl": 0.025332734920084476, + "learning_rate": 7.376666666666666e-07, + "loss": 0.0013, + "num_tokens": 2293456.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 144.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018772460520267487, + "kl": 0.0027480390563141555, + "learning_rate": 7.373333333333334e-07, + "loss": 0.0001, + "num_tokens": 2293722.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04934407398104668, + "kl": 0.042495377361774445, + "learning_rate": 7.37e-07, + "loss": 0.0021, + "num_tokens": 2294024.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 144.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011766253970563412, + "kl": 0.002495020627975464, + "learning_rate": 7.366666666666667e-07, + "loss": 0.0001, + "num_tokens": 2294240.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 144.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018804864957928658, + "kl": 0.003282279008999467, + "learning_rate": 7.363333333333333e-07, + "loss": 0.0002, + "num_tokens": 2294530.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 144.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09307824075222015, + "kl": 0.01906169019639492, + "learning_rate": 7.36e-07, + "loss": 0.001, + "num_tokens": 2294802.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01951969601213932, + "kl": 0.0034652543254196644, + "learning_rate": 7.356666666666667e-07, + "loss": 0.0002, + "num_tokens": 2295123.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 144.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07716208696365356, + "kl": 0.047879163175821304, + "learning_rate": 7.353333333333334e-07, + "loss": 0.0024, + "num_tokens": 2295497.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 144.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1274804174900055, + "kl": 0.013172006234526634, + "learning_rate": 7.350000000000001e-07, + "loss": 0.001, + "num_tokens": 2295855.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 144.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05018867924809456, + "kl": 0.059575026854872704, + "learning_rate": 7.346666666666666e-07, + "loss": 0.003, + "num_tokens": 2296230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 144.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05815522000193596, + "kl": 0.01121479517314583, + "learning_rate": 7.343333333333334e-07, + "loss": 0.0005, + "num_tokens": 2296532.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 144.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3417277336120605, + "kl": 0.14042966067790985, + "learning_rate": 7.34e-07, + "loss": 0.0087, + "num_tokens": 2296879.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 7799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 144.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02039925754070282, + "kl": 0.0007336969720199704, + "learning_rate": 7.336666666666667e-07, + "loss": 0.0, + "num_tokens": 2297115.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 144.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040640510618686676, + "kl": 0.0008734017610549927, + "learning_rate": 7.333333333333333e-07, + "loss": 0.0, + "num_tokens": 2297325.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 144.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04618072509765625, + "kl": 0.012975095305591822, + "learning_rate": 7.33e-07, + "loss": 0.0006, + "num_tokens": 2297660.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 144.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02459227293729782, + "kl": 0.001953233266249299, + "learning_rate": 7.326666666666667e-07, + "loss": 0.0001, + "num_tokens": 2297931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0069475783966481686, + "kl": 0.1613440364599228, + "learning_rate": 7.323333333333334e-07, + "loss": 0.0081, + "num_tokens": 2298240.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 144.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00587360979989171, + "kl": 0.0003445446491241455, + "learning_rate": 7.32e-07, + "loss": 0.0, + "num_tokens": 2298500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 144.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.115880489349365, + "kl": 0.011238036211580038, + "learning_rate": 7.316666666666666e-07, + "loss": 0.0466, + "num_tokens": 2298835.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 144.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04636514186859131, + "kl": 0.013248456176370382, + "learning_rate": 7.313333333333334e-07, + "loss": 0.0007, + "num_tokens": 2299109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 144.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1576414853334427, + "kl": 0.0263519324362278, + "learning_rate": 7.310000000000001e-07, + "loss": 0.0014, + "num_tokens": 2299379.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 144.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004367793910205364, + "kl": 0.0008925153524614871, + "learning_rate": 7.306666666666666e-07, + "loss": 0.0, + "num_tokens": 2299663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 144.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04627133160829544, + "kl": 0.05415169894695282, + "learning_rate": 7.303333333333333e-07, + "loss": 0.0027, + "num_tokens": 2299999.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28215914964675903, + "kl": 0.01585837733000517, + "learning_rate": 7.300000000000001e-07, + "loss": 0.0008, + "num_tokens": 2300334.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 144.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035695239901542664, + "kl": 0.04531832970678806, + "learning_rate": 7.296666666666667e-07, + "loss": 0.0023, + "num_tokens": 2300738.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 144.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.765526294708252, + "kl": 0.06695375754497945, + "learning_rate": 7.293333333333334e-07, + "loss": 0.1279, + "num_tokens": 2301075.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07994627207517624, + "kl": 0.03670547902584076, + "learning_rate": 7.29e-07, + "loss": 0.0017, + "num_tokens": 2301425.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 144.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09260185062885284, + "kl": 0.016643074341118336, + "learning_rate": 7.286666666666666e-07, + "loss": 0.0008, + "num_tokens": 2301724.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 144.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13086636364459991, + "kl": 0.018740601604804397, + "learning_rate": 7.283333333333334e-07, + "loss": 0.001, + "num_tokens": 2301998.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 144.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004347720183432102, + "kl": 0.00014990071940701455, + "learning_rate": 7.280000000000001e-07, + "loss": 0.0, + "num_tokens": 2302270.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 144.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20595932006835938, + "kl": 0.031236987560987473, + "learning_rate": 7.276666666666666e-07, + "loss": 0.0017, + "num_tokens": 2302601.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 144.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02995908074080944, + "kl": 0.0037928506499156356, + "learning_rate": 7.273333333333333e-07, + "loss": 0.0002, + "num_tokens": 2302818.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 144.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007915023015812039, + "kl": 0.0012798905372619629, + "learning_rate": 7.270000000000001e-07, + "loss": 0.0001, + "num_tokens": 2303098.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 144.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010825767181813717, + "kl": 0.0004008780015283264, + "learning_rate": 7.266666666666667e-07, + "loss": 0.0, + "num_tokens": 2303412.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 144.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.8050737380981445, + "kl": 0.9044334143400192, + "learning_rate": 7.263333333333333e-07, + "loss": 0.064, + "num_tokens": 2303685.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 144.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03513854742050171, + "kl": 0.0018830194603651762, + "learning_rate": 7.26e-07, + "loss": 0.0001, + "num_tokens": 2303957.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 144.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059505682438611984, + "kl": 0.015135690569877625, + "learning_rate": 7.256666666666667e-07, + "loss": 0.0008, + "num_tokens": 2304280.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 144.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05364292487502098, + "kl": 0.0028927183302585036, + "learning_rate": 7.253333333333334e-07, + "loss": 0.0002, + "num_tokens": 2304556.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 144.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003947919700294733, + "kl": 4.607439041137695e-05, + "learning_rate": 7.25e-07, + "loss": 0.0, + "num_tokens": 2304812.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 144.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014857267960906029, + "kl": 0.0005029442836530507, + "learning_rate": 7.246666666666667e-07, + "loss": 0.0, + "num_tokens": 2305061.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 144.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07076448947191238, + "kl": 0.0010878369212150574, + "learning_rate": 7.243333333333333e-07, + "loss": 0.0001, + "num_tokens": 2305273.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 144.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0993768721818924, + "kl": 0.022278862074017525, + "learning_rate": 7.240000000000001e-07, + "loss": 0.0012, + "num_tokens": 2305559.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 145.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01657954230904579, + "kl": 0.2657916694879532, + "learning_rate": 7.236666666666666e-07, + "loss": 0.0133, + "num_tokens": 2305863.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 145.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016393275931477547, + "kl": 0.001748779322952032, + "learning_rate": 7.233333333333333e-07, + "loss": 0.0001, + "num_tokens": 2306140.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 145.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016857551410794258, + "kl": 0.2657901346683502, + "learning_rate": 7.23e-07, + "loss": 0.0133, + "num_tokens": 2306444.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 145.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028873370960354805, + "kl": 0.012613944243639708, + "learning_rate": 7.226666666666667e-07, + "loss": 0.0007, + "num_tokens": 2306718.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 145.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027607008814811707, + "kl": 0.0006582169444300234, + "learning_rate": 7.223333333333334e-07, + "loss": 0.0, + "num_tokens": 2306951.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 145.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.604893684387207, + "kl": 0.028711873339489102, + "learning_rate": 7.22e-07, + "loss": 0.0751, + "num_tokens": 2307298.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 145.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08080136775970459, + "kl": 0.019822733476758003, + "learning_rate": 7.216666666666667e-07, + "loss": 0.001, + "num_tokens": 2307623.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 145.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13860845565795898, + "kl": 0.0819106437265873, + "learning_rate": 7.213333333333333e-07, + "loss": 0.0042, + "num_tokens": 2308020.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 145.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024136235937476158, + "kl": 0.001156538724899292, + "learning_rate": 7.210000000000001e-07, + "loss": 0.0001, + "num_tokens": 2308232.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 145.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03194114938378334, + "kl": 0.007562480866909027, + "learning_rate": 7.206666666666667e-07, + "loss": 0.0004, + "num_tokens": 2308521.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01061325054615736, + "kl": 0.0002386033520451747, + "learning_rate": 7.203333333333333e-07, + "loss": 0.0, + "num_tokens": 2308791.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 145.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.523732662200928, + "kl": 0.09044819604605436, + "learning_rate": 7.2e-07, + "loss": 0.0406, + "num_tokens": 2309089.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003621538169682026, + "kl": 0.0014243125915527344, + "learning_rate": 7.196666666666668e-07, + "loss": 0.0001, + "num_tokens": 2309349.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 145.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03520356863737106, + "kl": 0.04522598721086979, + "learning_rate": 7.193333333333333e-07, + "loss": 0.0023, + "num_tokens": 2309753.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 145.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020660918205976486, + "kl": 0.011906451545655727, + "learning_rate": 7.19e-07, + "loss": 0.0006, + "num_tokens": 2310013.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012060144916176796, + "kl": 0.0073663960210978985, + "learning_rate": 7.186666666666667e-07, + "loss": 0.0004, + "num_tokens": 2310285.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 145.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.119589805603027, + "kl": 0.026283381041139364, + "learning_rate": 7.183333333333333e-07, + "loss": 0.0258, + "num_tokens": 2310614.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 145.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005029130261391401, + "kl": 1.4007091522216797e-05, + "learning_rate": 7.18e-07, + "loss": 0.0, + "num_tokens": 2310834.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 145.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029953503981232643, + "kl": 0.0006314888596534729, + "learning_rate": 7.176666666666667e-07, + "loss": 0.0, + "num_tokens": 2311042.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 145.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011209409683942795, + "kl": 0.0023278817534446716, + "learning_rate": 7.173333333333333e-07, + "loss": 0.0001, + "num_tokens": 2311258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05701678991317749, + "kl": 0.00676327757537365, + "learning_rate": 7.17e-07, + "loss": 0.0003, + "num_tokens": 2311562.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 145.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006832170765846968, + "kl": 0.0004152357578277588, + "learning_rate": 7.166666666666668e-07, + "loss": 0.0, + "num_tokens": 2311822.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 145.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0734308585524559, + "kl": 0.02048661932349205, + "learning_rate": 7.163333333333333e-07, + "loss": 0.001, + "num_tokens": 2312116.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 145.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06470665335655212, + "kl": 0.018842053599655628, + "learning_rate": 7.16e-07, + "loss": 0.0009, + "num_tokens": 2312390.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 145.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06588394194841385, + "kl": 0.007601095596328378, + "learning_rate": 7.156666666666667e-07, + "loss": 0.0004, + "num_tokens": 2312704.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 145.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023730188608169556, + "kl": 0.003889709711074829, + "learning_rate": 7.153333333333334e-07, + "loss": 0.0002, + "num_tokens": 2312977.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 145.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017198164016008377, + "kl": 0.0042624999769032, + "learning_rate": 7.15e-07, + "loss": 0.0002, + "num_tokens": 2313315.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 145.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008212963584810495, + "kl": 0.0037628933787345886, + "learning_rate": 7.146666666666667e-07, + "loss": 0.0002, + "num_tokens": 2313551.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 145.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019984684884548187, + "kl": 0.0012029930367134511, + "learning_rate": 7.143333333333334e-07, + "loss": 0.0001, + "num_tokens": 2313874.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 145.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.683157920837402, + "kl": 0.17300502955913544, + "learning_rate": 7.14e-07, + "loss": 0.1559, + "num_tokens": 2314197.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.800112247467041, + "kl": 0.006968340370804071, + "learning_rate": 7.136666666666667e-07, + "loss": 0.085, + "num_tokens": 2314470.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 145.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04471684247255325, + "kl": 0.04221216402947903, + "learning_rate": 7.133333333333333e-07, + "loss": 0.0021, + "num_tokens": 2314770.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 145.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03296776860952377, + "kl": 0.003319399431347847, + "learning_rate": 7.13e-07, + "loss": 0.0002, + "num_tokens": 2315071.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 145.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05441904440522194, + "kl": 0.012024045921862125, + "learning_rate": 7.126666666666667e-07, + "loss": 0.0006, + "num_tokens": 2315379.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08604150265455246, + "kl": 0.0030240335618145764, + "learning_rate": 7.123333333333333e-07, + "loss": 0.0001, + "num_tokens": 2315651.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 145.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4204092025756836, + "kl": 0.11133990064263344, + "learning_rate": 7.12e-07, + "loss": 0.0252, + "num_tokens": 2316016.0, + "reward": 4.625, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 4.308422088623047, + "step": 7865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 145.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4855213165283203, + "kl": 0.07751660235226154, + "learning_rate": 7.116666666666667e-07, + "loss": 0.0039, + "num_tokens": 2316388.0, + "reward": 3.375, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 0.25, + "step": 7866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 145.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008625567890703678, + "kl": 0.00034084319486282766, + "learning_rate": 7.113333333333334e-07, + "loss": 0.0, + "num_tokens": 2316699.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 145.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04620688781142235, + "kl": 0.057636771351099014, + "learning_rate": 7.11e-07, + "loss": 0.0029, + "num_tokens": 2317032.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006471026688814163, + "kl": 0.0004990100860595703, + "learning_rate": 7.106666666666667e-07, + "loss": 0.0, + "num_tokens": 2317292.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 145.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019478855654597282, + "kl": 0.002590879797935486, + "learning_rate": 7.103333333333334e-07, + "loss": 0.0001, + "num_tokens": 2317580.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 145.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012735706754028797, + "kl": 0.0022063918877393007, + "learning_rate": 7.1e-07, + "loss": 0.0001, + "num_tokens": 2317876.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 145.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16797488927841187, + "kl": 0.0031422898173332214, + "learning_rate": 7.096666666666667e-07, + "loss": 0.0002, + "num_tokens": 2318088.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 145.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033851008862257004, + "kl": 0.0063656826969236135, + "learning_rate": 7.093333333333334e-07, + "loss": 0.0003, + "num_tokens": 2318421.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 145.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0365244597196579, + "kl": 0.043304454535245895, + "learning_rate": 7.09e-07, + "loss": 0.0023, + "num_tokens": 2318795.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12554596364498138, + "kl": 0.007517669582739472, + "learning_rate": 7.086666666666667e-07, + "loss": 0.0004, + "num_tokens": 2319081.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 145.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0053313616663217545, + "kl": 0.0004590122262015939, + "learning_rate": 7.083333333333334e-07, + "loss": 0.0, + "num_tokens": 2319330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09370862692594528, + "kl": 0.00645895441994071, + "learning_rate": 7.079999999999999e-07, + "loss": 0.0003, + "num_tokens": 2319592.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 145.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037812262773513794, + "kl": 0.00982398958876729, + "learning_rate": 7.076666666666667e-07, + "loss": 0.0006, + "num_tokens": 2319918.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 145.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06566604226827621, + "kl": 0.001881853153463453, + "learning_rate": 7.073333333333334e-07, + "loss": 0.0001, + "num_tokens": 2320140.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 145.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05149158462882042, + "kl": 0.0015288891954696737, + "learning_rate": 7.07e-07, + "loss": 0.0001, + "num_tokens": 2320397.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09078837186098099, + "kl": 0.010929904878139496, + "learning_rate": 7.066666666666666e-07, + "loss": 0.0005, + "num_tokens": 2320668.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 145.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.645307540893555, + "kl": 0.04936479404568672, + "learning_rate": 7.063333333333334e-07, + "loss": 0.2113, + "num_tokens": 2321016.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 145.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12648501992225647, + "kl": 0.03250580746680498, + "learning_rate": 7.06e-07, + "loss": 0.0017, + "num_tokens": 2321305.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 146.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029397668316960335, + "kl": 0.004287875635782257, + "learning_rate": 7.056666666666667e-07, + "loss": 0.0002, + "num_tokens": 2321573.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 146.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014732036739587784, + "kl": 0.0015842552529647946, + "learning_rate": 7.053333333333333e-07, + "loss": 0.0001, + "num_tokens": 2321847.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 146.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09335663169622421, + "kl": 0.005483564687892795, + "learning_rate": 7.05e-07, + "loss": 0.0003, + "num_tokens": 2322107.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 146.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5405759811401367, + "kl": 0.047873176634311676, + "learning_rate": 7.046666666666667e-07, + "loss": 0.0831, + "num_tokens": 2322442.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 7887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 146.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6981112957000732, + "kl": 0.006339870858937502, + "learning_rate": 7.043333333333334e-07, + "loss": 0.2224, + "num_tokens": 2322805.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 146.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036600496619939804, + "kl": 0.0019297749386169016, + "learning_rate": 7.040000000000001e-07, + "loss": 0.0001, + "num_tokens": 2323073.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 146.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11252889037132263, + "kl": 0.02112907450646162, + "learning_rate": 7.036666666666666e-07, + "loss": 0.001, + "num_tokens": 2323378.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 146.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02057185396552086, + "kl": 0.011972520500421524, + "learning_rate": 7.033333333333334e-07, + "loss": 0.0006, + "num_tokens": 2323638.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 146.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01874908246099949, + "kl": 0.004027571063488722, + "learning_rate": 7.03e-07, + "loss": 0.0002, + "num_tokens": 2323896.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 146.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007877022144384682, + "kl": 0.0037712156772613525, + "learning_rate": 7.026666666666667e-07, + "loss": 0.0002, + "num_tokens": 2324132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 146.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09150916337966919, + "kl": 0.03694234415888786, + "learning_rate": 7.023333333333333e-07, + "loss": 0.0018, + "num_tokens": 2324448.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 146.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1805944442749023, + "kl": 0.14544594287872314, + "learning_rate": 7.02e-07, + "loss": 0.0437, + "num_tokens": 2324822.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 146.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03395191207528114, + "kl": 0.1617203652858734, + "learning_rate": 7.016666666666667e-07, + "loss": 0.0081, + "num_tokens": 2325132.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 146.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005269645480439067, + "kl": 2.358853816986084e-05, + "learning_rate": 7.013333333333334e-07, + "loss": 0.0, + "num_tokens": 2325344.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 146.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04336988925933838, + "kl": 0.006349961506202817, + "learning_rate": 7.01e-07, + "loss": 0.0003, + "num_tokens": 2325637.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 146.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09106840938329697, + "kl": 0.006232064217329025, + "learning_rate": 7.006666666666666e-07, + "loss": 0.0003, + "num_tokens": 2325852.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 146.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028742168098688126, + "kl": 0.00420410861261189, + "learning_rate": 7.003333333333334e-07, + "loss": 0.0002, + "num_tokens": 2326196.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 146.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8615872859954834, + "kl": 0.01856350596062839, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0004, + "num_tokens": 2326525.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 7901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 146.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03740246221423149, + "kl": 0.0005555689131142572, + "learning_rate": 6.996666666666666e-07, + "loss": 0.0, + "num_tokens": 2326781.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 146.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02081618830561638, + "kl": 0.047073764726519585, + "learning_rate": 6.993333333333333e-07, + "loss": 0.0024, + "num_tokens": 2327185.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 146.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.065969228744507, + "kl": 0.07833204790949821, + "learning_rate": 6.990000000000001e-07, + "loss": -0.0042, + "num_tokens": 2327561.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 146.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022117936983704567, + "kl": 0.001973973121494055, + "learning_rate": 6.986666666666667e-07, + "loss": 0.0001, + "num_tokens": 2327857.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 146.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03592696040868759, + "kl": 0.003980218549259007, + "learning_rate": 6.983333333333334e-07, + "loss": 0.0002, + "num_tokens": 2328147.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 146.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016566133126616478, + "kl": 0.26588982343673706, + "learning_rate": 6.98e-07, + "loss": 0.0133, + "num_tokens": 2328451.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 146.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009487488307058811, + "kl": 0.00019461263946141116, + "learning_rate": 6.976666666666666e-07, + "loss": 0.0, + "num_tokens": 2328721.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 146.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005486609879881144, + "kl": 0.0003608107508625835, + "learning_rate": 6.973333333333334e-07, + "loss": 0.0, + "num_tokens": 2328941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 146.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005849889479577541, + "kl": 0.001218301069457084, + "learning_rate": 6.970000000000001e-07, + "loss": 0.0001, + "num_tokens": 2329201.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 146.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010366600006818771, + "kl": 0.0013481086352840066, + "learning_rate": 6.966666666666666e-07, + "loss": 0.0001, + "num_tokens": 2329471.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0389113612473011, + "kl": 0.023123985156416893, + "learning_rate": 6.963333333333333e-07, + "loss": 0.0012, + "num_tokens": 2329745.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 146.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05154503136873245, + "kl": 0.04030958376824856, + "learning_rate": 6.960000000000001e-07, + "loss": 0.002, + "num_tokens": 2330056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 146.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01767592318356037, + "kl": 0.0008933462668210268, + "learning_rate": 6.956666666666667e-07, + "loss": 0.0, + "num_tokens": 2330332.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 146.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036674413830041885, + "kl": 0.07475689984858036, + "learning_rate": 6.953333333333333e-07, + "loss": 0.0038, + "num_tokens": 2330702.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 146.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008082011714577675, + "kl": 0.0004647746682167053, + "learning_rate": 6.95e-07, + "loss": 0.0, + "num_tokens": 2330962.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 146.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02330547571182251, + "kl": 0.0010652343335095793, + "learning_rate": 6.946666666666667e-07, + "loss": 0.0, + "num_tokens": 2331178.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 146.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09168670326471329, + "kl": 0.019862588495016098, + "learning_rate": 6.943333333333334e-07, + "loss": 0.001, + "num_tokens": 2331475.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 146.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033646319061517715, + "kl": 0.0031009033555164933, + "learning_rate": 6.94e-07, + "loss": 0.0002, + "num_tokens": 2331749.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 146.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009794164448976517, + "kl": 0.00040640901715960354, + "learning_rate": 6.936666666666667e-07, + "loss": 0.0, + "num_tokens": 2332067.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 146.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022331416606903076, + "kl": 0.006107160821557045, + "learning_rate": 6.933333333333333e-07, + "loss": 0.0003, + "num_tokens": 2332335.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07560805231332779, + "kl": 0.017447875812649727, + "learning_rate": 6.930000000000001e-07, + "loss": 0.001, + "num_tokens": 2332617.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026410184800624847, + "kl": 0.007153031183406711, + "learning_rate": 6.926666666666666e-07, + "loss": 0.0003, + "num_tokens": 2332909.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 146.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007500403095036745, + "kl": 0.0003406302275834605, + "learning_rate": 6.923333333333333e-07, + "loss": 0.0, + "num_tokens": 2333158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 146.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053788814693689346, + "kl": 0.04855903051793575, + "learning_rate": 6.92e-07, + "loss": 0.0024, + "num_tokens": 2333495.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 146.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0677071139216423, + "kl": 0.016340465284883976, + "learning_rate": 6.916666666666667e-07, + "loss": 0.0008, + "num_tokens": 2333819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 146.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01156686432659626, + "kl": 0.0026587173342704773, + "learning_rate": 6.913333333333334e-07, + "loss": 0.0001, + "num_tokens": 2334035.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 146.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2613486349582672, + "kl": 0.0367953865788877, + "learning_rate": 6.91e-07, + "loss": 0.0018, + "num_tokens": 2334339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 146.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03902868181467056, + "kl": 0.009209196548908949, + "learning_rate": 6.906666666666667e-07, + "loss": 0.0005, + "num_tokens": 2334670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05344216525554657, + "kl": 0.011436731845606118, + "learning_rate": 6.903333333333333e-07, + "loss": 0.0006, + "num_tokens": 2334957.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050553612411022186, + "kl": 0.008439704310148954, + "learning_rate": 6.900000000000001e-07, + "loss": 0.0004, + "num_tokens": 2335259.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 146.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030578509904444218, + "kl": 0.0001486109395045787, + "learning_rate": 6.896666666666667e-07, + "loss": 0.0, + "num_tokens": 2335571.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 42.0, + "completions/mean_terminated_length": 42.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 146.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7461144924163818, + "kl": 0.11521635204553604, + "learning_rate": 6.893333333333333e-07, + "loss": 0.0369, + "num_tokens": 2335955.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 7933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 146.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01660105399787426, + "kl": 0.0004939181380905211, + "learning_rate": 6.89e-07, + "loss": 0.0, + "num_tokens": 2336188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 146.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032070622546598315, + "kl": 8.128583431243896e-06, + "learning_rate": 6.886666666666668e-07, + "loss": 0.0, + "num_tokens": 2336408.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 146.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055148590356111526, + "kl": 0.008990629576146603, + "learning_rate": 6.883333333333333e-07, + "loss": 0.0004, + "num_tokens": 2336720.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 146.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12067971378564835, + "kl": 0.033998752012848854, + "learning_rate": 6.88e-07, + "loss": 0.0018, + "num_tokens": 2337008.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2857086658477783, + "kl": 0.02930644527077675, + "learning_rate": 6.876666666666667e-07, + "loss": 0.0015, + "num_tokens": 2337290.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 147.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03171192482113838, + "kl": 0.0007163698101066984, + "learning_rate": 6.873333333333333e-07, + "loss": 0.0, + "num_tokens": 2337547.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10451661795377731, + "kl": 0.024647328886203468, + "learning_rate": 6.87e-07, + "loss": 0.0012, + "num_tokens": 2337837.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 147.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17091840505599976, + "kl": 0.008172134403139353, + "learning_rate": 6.866666666666667e-07, + "loss": 0.0006, + "num_tokens": 2338064.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 147.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3531803786754608, + "kl": 0.01787441223859787, + "learning_rate": 6.863333333333333e-07, + "loss": 0.0012, + "num_tokens": 2338318.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 147.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030056845396757126, + "kl": 0.006302210036665201, + "learning_rate": 6.86e-07, + "loss": 0.0003, + "num_tokens": 2338586.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 147.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00673312321305275, + "kl": 0.163905531167984, + "learning_rate": 6.856666666666668e-07, + "loss": 0.0082, + "num_tokens": 2338894.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 147.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22499795258045197, + "kl": 0.01883025059942156, + "learning_rate": 6.853333333333333e-07, + "loss": 0.001, + "num_tokens": 2339192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 147.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1145038977265358, + "kl": 0.022760297171771526, + "learning_rate": 6.85e-07, + "loss": 0.0011, + "num_tokens": 2339492.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 147.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6732354164123535, + "kl": 0.6013218630105257, + "learning_rate": 6.846666666666667e-07, + "loss": 0.0609, + "num_tokens": 2339831.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 7947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 147.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6160852909088135, + "kl": 0.12218927592039108, + "learning_rate": 6.843333333333334e-07, + "loss": -0.0094, + "num_tokens": 2340195.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 7948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 147.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00030729122227057815, + "kl": 6.951391696929932e-06, + "learning_rate": 6.84e-07, + "loss": 0.0, + "num_tokens": 2340415.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 147.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13470609486103058, + "kl": 0.013342326506972313, + "learning_rate": 6.836666666666667e-07, + "loss": 0.0008, + "num_tokens": 2340675.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 147.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028726885095238686, + "kl": 0.001210954214911908, + "learning_rate": 6.833333333333333e-07, + "loss": 0.0001, + "num_tokens": 2340945.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 147.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028139915317296982, + "kl": 0.00025150924921035767, + "learning_rate": 6.83e-07, + "loss": 0.0, + "num_tokens": 2341157.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06457463651895523, + "kl": 0.006834480445832014, + "learning_rate": 6.826666666666667e-07, + "loss": 0.0003, + "num_tokens": 2341455.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 147.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10724636167287827, + "kl": 0.015549950301647186, + "learning_rate": 6.823333333333333e-07, + "loss": 0.0008, + "num_tokens": 2341731.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 147.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009234708733856678, + "kl": 0.0004986152052879333, + "learning_rate": 6.82e-07, + "loss": 0.0, + "num_tokens": 2341991.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 147.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05288499593734741, + "kl": 0.011790297867264599, + "learning_rate": 6.816666666666667e-07, + "loss": 0.0006, + "num_tokens": 2342310.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 147.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005728833493776619, + "kl": 0.0012573727872222662, + "learning_rate": 6.813333333333333e-07, + "loss": 0.0001, + "num_tokens": 2342590.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 147.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007495522731915116, + "kl": 0.00378631055355072, + "learning_rate": 6.81e-07, + "loss": 0.0002, + "num_tokens": 2342826.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 147.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02170245349407196, + "kl": 0.0016181372193386778, + "learning_rate": 6.806666666666667e-07, + "loss": 0.0001, + "num_tokens": 2343149.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 147.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.252466082572937, + "kl": 0.02959541231393814, + "learning_rate": 6.803333333333334e-07, + "loss": 0.0023, + "num_tokens": 2343412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 147.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027295619249343872, + "kl": 0.000490233302116394, + "learning_rate": 6.8e-07, + "loss": 0.0, + "num_tokens": 2343622.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11848331242799759, + "kl": 0.014552897773683071, + "learning_rate": 6.796666666666667e-07, + "loss": 0.0008, + "num_tokens": 2343898.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 147.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029056072235107422, + "kl": 0.003153599624056369, + "learning_rate": 6.793333333333334e-07, + "loss": 0.0001, + "num_tokens": 2344164.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008045547641813755, + "kl": 0.0008295339066535234, + "learning_rate": 6.79e-07, + "loss": 0.0, + "num_tokens": 2344448.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 147.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03112046979367733, + "kl": 0.044505782425403595, + "learning_rate": 6.786666666666667e-07, + "loss": 0.0022, + "num_tokens": 2344852.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 7965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 147.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16668641567230225, + "kl": 0.08013206720352173, + "learning_rate": 6.783333333333334e-07, + "loss": 0.004, + "num_tokens": 2345242.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 147.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830543041229248, + "kl": 0.015461879782378674, + "learning_rate": 6.78e-07, + "loss": 0.0008, + "num_tokens": 2345571.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 147.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03035755455493927, + "kl": 0.0059231220511719584, + "learning_rate": 6.776666666666667e-07, + "loss": 0.0004, + "num_tokens": 2345928.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 147.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.366961479187012, + "kl": 0.04303776248707436, + "learning_rate": 6.773333333333334e-07, + "loss": 0.2485, + "num_tokens": 2346158.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 7969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 147.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06187133118510246, + "kl": 0.0033652736456133425, + "learning_rate": 6.769999999999999e-07, + "loss": 0.0002, + "num_tokens": 2346480.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 147.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013169502839446068, + "kl": 0.0004002231962658698, + "learning_rate": 6.766666666666667e-07, + "loss": 0.0, + "num_tokens": 2346714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 147.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01761413738131523, + "kl": 0.003219752514269203, + "learning_rate": 6.763333333333334e-07, + "loss": 0.0002, + "num_tokens": 2347003.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 147.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045347876846790314, + "kl": 0.00628051976673305, + "learning_rate": 6.76e-07, + "loss": 0.0003, + "num_tokens": 2347295.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 147.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11145929992198944, + "kl": 0.0370666328817606, + "learning_rate": 6.756666666666666e-07, + "loss": 0.0019, + "num_tokens": 2347632.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026025963947176933, + "kl": 0.009753241203725338, + "learning_rate": 6.753333333333334e-07, + "loss": 0.0005, + "num_tokens": 2347918.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 147.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0303365346044302, + "kl": 0.0010238439062959515, + "learning_rate": 6.75e-07, + "loss": 0.0001, + "num_tokens": 2348224.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 147.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016308927908539772, + "kl": 0.2659081071615219, + "learning_rate": 6.746666666666667e-07, + "loss": 0.0133, + "num_tokens": 2348528.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 147.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01003574300557375, + "kl": 0.00020775644952664152, + "learning_rate": 6.743333333333333e-07, + "loss": 0.0, + "num_tokens": 2348798.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 147.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0372542105615139, + "kl": 0.00859007053077221, + "learning_rate": 6.74e-07, + "loss": 0.0004, + "num_tokens": 2349155.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 147.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.595075607299805, + "kl": 0.04293276369571686, + "learning_rate": 6.736666666666667e-07, + "loss": -0.0044, + "num_tokens": 2349470.0, + "reward": 2.0, + "reward_std": 2.4494898319244385, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 2.4494898319244385, + "step": 7980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012321427464485168, + "kl": 0.00736632477492094, + "learning_rate": 6.733333333333334e-07, + "loss": 0.0004, + "num_tokens": 2349742.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 147.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005781739484518766, + "kl": 0.00030984529439592734, + "learning_rate": 6.73e-07, + "loss": 0.0, + "num_tokens": 2350004.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 147.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13076011836528778, + "kl": 0.014736629091203213, + "learning_rate": 6.726666666666666e-07, + "loss": 0.0007, + "num_tokens": 2350299.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 147.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022307464852929115, + "kl": 0.003343239426612854, + "learning_rate": 6.723333333333334e-07, + "loss": 0.0002, + "num_tokens": 2350572.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 147.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07655911147594452, + "kl": 0.004491940140724182, + "learning_rate": 6.72e-07, + "loss": 0.0002, + "num_tokens": 2350840.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 147.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0214406568557024, + "kl": 0.01185589050874114, + "learning_rate": 6.716666666666667e-07, + "loss": 0.0006, + "num_tokens": 2351100.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 147.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5018789768218994, + "kl": 0.1297360584139824, + "learning_rate": 6.713333333333333e-07, + "loss": 0.0063, + "num_tokens": 2351414.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 147.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06903313845396042, + "kl": 0.03757285699248314, + "learning_rate": 6.71e-07, + "loss": 0.0021, + "num_tokens": 2351795.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 7988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3.0, + "completions/max_terminated_length": 3.0, + "completions/mean_length": 2.25, + "completions/mean_terminated_length": 2.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 147.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 21.11982536315918, + "kl": 0.12326832115650177, + "learning_rate": 6.706666666666667e-07, + "loss": 0.0888, + "num_tokens": 2352008.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 7989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 147.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008687658235430717, + "kl": 0.09840095788240433, + "learning_rate": 6.703333333333334e-07, + "loss": 0.0049, + "num_tokens": 2352380.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 147.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0136325359344482, + "kl": 0.23587112640962005, + "learning_rate": 6.7e-07, + "loss": 0.0135, + "num_tokens": 2352711.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 148.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059992823749780655, + "kl": 0.009034299524500966, + "learning_rate": 6.696666666666666e-07, + "loss": 0.0005, + "num_tokens": 2353025.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 148.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013757170177996159, + "kl": 0.00013109147403156385, + "learning_rate": 6.693333333333334e-07, + "loss": 0.0, + "num_tokens": 2353281.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 7993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 148.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5506874918937683, + "kl": 0.15094274282455444, + "learning_rate": 6.690000000000001e-07, + "loss": 0.007, + "num_tokens": 2353630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027551544830203056, + "kl": 0.002665444160811603, + "learning_rate": 6.686666666666666e-07, + "loss": 0.0001, + "num_tokens": 2353903.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 148.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07488074153661728, + "kl": 0.03005118388682604, + "learning_rate": 6.683333333333333e-07, + "loss": 0.0015, + "num_tokens": 2354175.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 148.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05191029980778694, + "kl": 0.0031926408410072327, + "learning_rate": 6.680000000000001e-07, + "loss": 0.0002, + "num_tokens": 2354419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 148.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008011865429580212, + "kl": 0.0037679076194763184, + "learning_rate": 6.676666666666667e-07, + "loss": 0.0002, + "num_tokens": 2354655.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 7998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009269864298403263, + "kl": 0.000871806318173185, + "learning_rate": 6.673333333333334e-07, + "loss": 0.0, + "num_tokens": 2354937.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 7999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 148.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006255480460822582, + "kl": 0.0004456430615391582, + "learning_rate": 6.67e-07, + "loss": 0.0, + "num_tokens": 2355197.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 148.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032542139291763306, + "kl": 0.027465634047985077, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0014, + "num_tokens": 2355563.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 148.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05432083085179329, + "kl": 0.013635757379233837, + "learning_rate": 6.663333333333334e-07, + "loss": 0.0007, + "num_tokens": 2355899.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 148.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18009230494499207, + "kl": 0.017343849409371614, + "learning_rate": 6.660000000000001e-07, + "loss": 0.001, + "num_tokens": 2356170.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035313963890075684, + "kl": 0.01275394344702363, + "learning_rate": 6.656666666666666e-07, + "loss": 0.0007, + "num_tokens": 2356444.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 148.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03571133315563202, + "kl": 0.003046083264052868, + "learning_rate": 6.653333333333333e-07, + "loss": 0.0002, + "num_tokens": 2356756.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 148.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0070494115352630615, + "kl": 0.0003920156304957345, + "learning_rate": 6.650000000000001e-07, + "loss": 0.0, + "num_tokens": 2357075.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 148.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02196187898516655, + "kl": 0.0010163102997466922, + "learning_rate": 6.646666666666667e-07, + "loss": 0.0001, + "num_tokens": 2357355.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 148.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021372133865952492, + "kl": 0.011771119199693203, + "learning_rate": 6.643333333333333e-07, + "loss": 0.0006, + "num_tokens": 2357615.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 148.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7622270584106445, + "kl": 0.12751147523522377, + "learning_rate": 6.64e-07, + "loss": 0.0067, + "num_tokens": 2357907.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 148.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028814680874347687, + "kl": 0.006982567410886986, + "learning_rate": 6.636666666666667e-07, + "loss": 0.0003, + "num_tokens": 2358179.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10555873811244965, + "kl": 0.0092735611833632, + "learning_rate": 6.633333333333334e-07, + "loss": 0.0005, + "num_tokens": 2358479.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 148.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034490421414375305, + "kl": 0.009673627093434334, + "learning_rate": 6.63e-07, + "loss": 0.0005, + "num_tokens": 2358782.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 148.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02914639189839363, + "kl": 0.0002091825008392334, + "learning_rate": 6.626666666666666e-07, + "loss": 0.0, + "num_tokens": 2358994.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 148.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010764111764729023, + "kl": 0.006849497323855758, + "learning_rate": 6.623333333333333e-07, + "loss": 0.0003, + "num_tokens": 2359286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 148.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009475374594330788, + "kl": 0.09832324832677841, + "learning_rate": 6.620000000000001e-07, + "loss": 0.0049, + "num_tokens": 2359658.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 148.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027653368189930916, + "kl": 0.0018219202756881714, + "learning_rate": 6.616666666666666e-07, + "loss": 0.0001, + "num_tokens": 2359870.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 148.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17658135294914246, + "kl": 0.011822124011814594, + "learning_rate": 6.613333333333333e-07, + "loss": 0.0007, + "num_tokens": 2360089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 148.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11850176751613617, + "kl": 0.015849125338718295, + "learning_rate": 6.61e-07, + "loss": 0.0008, + "num_tokens": 2360413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 148.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.222862482070923, + "kl": 0.07072535157203674, + "learning_rate": 6.606666666666667e-07, + "loss": -0.025, + "num_tokens": 2360743.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 148.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10633479803800583, + "kl": 0.016889100894331932, + "learning_rate": 6.603333333333334e-07, + "loss": 0.0007, + "num_tokens": 2361068.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.8403096199035645, + "kl": 0.10900132835377008, + "learning_rate": 6.6e-07, + "loss": 0.2219, + "num_tokens": 2361369.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 8021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 148.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04380648210644722, + "kl": 0.1606292575597763, + "learning_rate": 6.596666666666667e-07, + "loss": 0.008, + "num_tokens": 2361680.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 148.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019948706030845642, + "kl": 0.00022693723440170288, + "learning_rate": 6.593333333333333e-07, + "loss": 0.0, + "num_tokens": 2361888.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 148.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.14555025100708, + "kl": 0.10182899609208107, + "learning_rate": 6.590000000000001e-07, + "loss": 0.2441, + "num_tokens": 2362198.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 8024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 148.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7454323768615723, + "kl": 0.015445174183696508, + "learning_rate": 6.586666666666667e-07, + "loss": -0.0016, + "num_tokens": 2362532.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 8025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 148.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04945673793554306, + "kl": 0.001501104183262214, + "learning_rate": 6.583333333333333e-07, + "loss": 0.0001, + "num_tokens": 2362751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 148.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05545986443758011, + "kl": 0.001404590904712677, + "learning_rate": 6.58e-07, + "loss": 0.0001, + "num_tokens": 2362971.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08915232867002487, + "kl": 0.014578057453036308, + "learning_rate": 6.576666666666668e-07, + "loss": 0.0007, + "num_tokens": 2363249.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 148.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056035757064819336, + "kl": 0.04754984565079212, + "learning_rate": 6.573333333333333e-07, + "loss": 0.0024, + "num_tokens": 2363658.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 82.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 82.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 148.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.023529052734375, + "kl": 0.027549312449991703, + "learning_rate": 6.57e-07, + "loss": 0.418, + "num_tokens": 2364206.0, + "reward": 3.049999952316284, + "reward_std": 1.899999976158142, + "rewards/reward_combined/mean": 3.049999952316284, + "rewards/reward_combined/std": 1.899999976158142, + "step": 8030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 148.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026580994948744774, + "kl": 0.00195881724357605, + "learning_rate": 6.566666666666667e-07, + "loss": 0.0001, + "num_tokens": 2364466.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 148.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019084108993411064, + "kl": 0.0006825370655860752, + "learning_rate": 6.563333333333333e-07, + "loss": 0.0, + "num_tokens": 2364702.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010667764581739902, + "kl": 0.0013720928691327572, + "learning_rate": 6.56e-07, + "loss": 0.0001, + "num_tokens": 2364972.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 148.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053804799914360046, + "kl": 0.026516889221966267, + "learning_rate": 6.556666666666667e-07, + "loss": 0.0014, + "num_tokens": 2365261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 148.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017637724056839943, + "kl": 0.26560617983341217, + "learning_rate": 6.553333333333333e-07, + "loss": 0.0133, + "num_tokens": 2365565.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 148.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3374927043914795, + "kl": 0.010917457402683794, + "learning_rate": 6.55e-07, + "loss": -0.0376, + "num_tokens": 2365867.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 8036 + }, + { + "clip_ratio/high_max": 0.007936508394777775, + "clip_ratio/high_mean": 0.007936508394777775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007936508394777775, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 148.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1461198329925537, + "kl": 0.2187935635447502, + "learning_rate": 6.546666666666668e-07, + "loss": -0.0485, + "num_tokens": 2366233.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 8037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 148.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.414330244064331, + "kl": 0.14570897817611694, + "learning_rate": 6.543333333333333e-07, + "loss": -0.0005, + "num_tokens": 2366578.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 148.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015954479575157166, + "kl": 0.00042009057870018296, + "learning_rate": 6.54e-07, + "loss": 0.0, + "num_tokens": 2366887.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 148.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07917128503322601, + "kl": 0.03889700584113598, + "learning_rate": 6.536666666666667e-07, + "loss": 0.0019, + "num_tokens": 2367215.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 148.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18024809658527374, + "kl": 0.02083851397037506, + "learning_rate": 6.533333333333334e-07, + "loss": 0.0011, + "num_tokens": 2367558.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 148.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09989302605390549, + "kl": 0.008720326703041792, + "learning_rate": 6.53e-07, + "loss": 0.0004, + "num_tokens": 2367824.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 148.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034171611070632935, + "kl": 0.0009974651038646698, + "learning_rate": 6.526666666666667e-07, + "loss": 0.0, + "num_tokens": 2368084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 148.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5776098370552063, + "kl": 0.06068794883321971, + "learning_rate": 6.523333333333333e-07, + "loss": 0.0036, + "num_tokens": 2368380.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 148.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02446579746901989, + "kl": 0.0026450157165527344, + "learning_rate": 6.52e-07, + "loss": 0.0001, + "num_tokens": 2368712.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 149.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8118560314178467, + "kl": 0.019750438630580902, + "learning_rate": 6.516666666666667e-07, + "loss": -0.0311, + "num_tokens": 2369005.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 149.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07364393770694733, + "kl": 0.0026061697863042355, + "learning_rate": 6.513333333333333e-07, + "loss": 0.0001, + "num_tokens": 2369276.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026624208316206932, + "kl": 0.012482813559472561, + "learning_rate": 6.51e-07, + "loss": 0.0007, + "num_tokens": 2369550.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 149.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06158352643251419, + "kl": 0.0038567623123526573, + "learning_rate": 6.506666666666667e-07, + "loss": 0.0002, + "num_tokens": 2369815.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1643344610929489, + "kl": 0.025559797883033752, + "learning_rate": 6.503333333333333e-07, + "loss": 0.0014, + "num_tokens": 2370094.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 149.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012544273398816586, + "kl": 0.0007855668663978577, + "learning_rate": 6.5e-07, + "loss": 0.0, + "num_tokens": 2370302.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 149.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010793453082442284, + "kl": 0.0006618913030251861, + "learning_rate": 6.496666666666667e-07, + "loss": 0.0, + "num_tokens": 2370624.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 149.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09990867972373962, + "kl": 0.2681703567504883, + "learning_rate": 6.493333333333334e-07, + "loss": 0.0134, + "num_tokens": 2370928.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 149.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013178651221096516, + "kl": 0.00012609064651769586, + "learning_rate": 6.49e-07, + "loss": 0.0, + "num_tokens": 2371184.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 149.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04322998970746994, + "kl": 0.04183654114603996, + "learning_rate": 6.486666666666667e-07, + "loss": 0.0021, + "num_tokens": 2371588.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 149.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003713405691087246, + "kl": 0.0002658894009073265, + "learning_rate": 6.483333333333334e-07, + "loss": 0.0, + "num_tokens": 2371831.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 149.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5020883083343506, + "kl": 0.23826029431074858, + "learning_rate": 6.48e-07, + "loss": 0.0126, + "num_tokens": 2372122.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 149.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01584302820265293, + "kl": 0.15759839117527008, + "learning_rate": 6.476666666666667e-07, + "loss": 0.0079, + "num_tokens": 2372433.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 149.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0945601835846901, + "kl": 0.022564067505300045, + "learning_rate": 6.473333333333334e-07, + "loss": 0.0011, + "num_tokens": 2372740.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 149.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1706409454345703, + "kl": 0.010000812355428934, + "learning_rate": 6.47e-07, + "loss": -0.0933, + "num_tokens": 2373018.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 8060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026172997429966927, + "kl": 0.0016213970957323909, + "learning_rate": 6.466666666666667e-07, + "loss": 0.0001, + "num_tokens": 2373290.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 149.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.560152053833008, + "kl": 0.05466253496706486, + "learning_rate": 6.463333333333334e-07, + "loss": 0.0258, + "num_tokens": 2373676.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 149.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016302743926644325, + "kl": 0.0007113851606845856, + "learning_rate": 6.459999999999999e-07, + "loss": 0.0, + "num_tokens": 2373968.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 149.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025030776858329773, + "kl": 0.012033788254484534, + "learning_rate": 6.456666666666667e-07, + "loss": 0.0005, + "num_tokens": 2374324.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 149.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031675782054662704, + "kl": 0.003703831462189555, + "learning_rate": 6.453333333333334e-07, + "loss": 0.0002, + "num_tokens": 2374656.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 149.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026614150032401085, + "kl": 0.0017110109329223633, + "learning_rate": 6.45e-07, + "loss": 0.0001, + "num_tokens": 2374868.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 149.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003367283206898719, + "kl": 7.336835187743418e-05, + "learning_rate": 6.446666666666666e-07, + "loss": 0.0, + "num_tokens": 2375180.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 149.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027076590806245804, + "kl": 0.0008286722004413605, + "learning_rate": 6.443333333333334e-07, + "loss": 0.0, + "num_tokens": 2375440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 149.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03575679659843445, + "kl": 0.008242711424827576, + "learning_rate": 6.44e-07, + "loss": 0.0004, + "num_tokens": 2375792.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 149.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03205917775630951, + "kl": 0.00024272501468658447, + "learning_rate": 6.436666666666667e-07, + "loss": 0.0, + "num_tokens": 2376004.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 149.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12120383232831955, + "kl": 0.02965997252613306, + "learning_rate": 6.433333333333334e-07, + "loss": 0.0014, + "num_tokens": 2376328.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010393207194283605, + "kl": 0.001351326471194625, + "learning_rate": 6.43e-07, + "loss": 0.0001, + "num_tokens": 2376605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 149.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03497472032904625, + "kl": 0.05707773193717003, + "learning_rate": 6.426666666666667e-07, + "loss": 0.0029, + "num_tokens": 2376986.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 149.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09369508177042007, + "kl": 0.005586998537182808, + "learning_rate": 6.423333333333334e-07, + "loss": 0.0003, + "num_tokens": 2377247.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 149.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6338117122650146, + "kl": 0.14234613627195358, + "learning_rate": 6.42e-07, + "loss": -0.0622, + "num_tokens": 2377600.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 8075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 149.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020768476650118828, + "kl": 0.0062638719828100875, + "learning_rate": 6.416666666666666e-07, + "loss": 0.0003, + "num_tokens": 2377872.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 149.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.97605037689209, + "kl": 1.336567960679531, + "learning_rate": 6.413333333333334e-07, + "loss": 0.164, + "num_tokens": 2378110.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 149.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02859625220298767, + "kl": 0.10031475871801376, + "learning_rate": 6.41e-07, + "loss": 0.005, + "num_tokens": 2378482.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 149.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009700460359454155, + "kl": 0.0005378978530643508, + "learning_rate": 6.406666666666667e-07, + "loss": 0.0, + "num_tokens": 2378717.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 149.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.086411952972412, + "kl": 0.04046951234340668, + "learning_rate": 6.403333333333333e-07, + "loss": 0.0106, + "num_tokens": 2379073.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 149.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.591157913208008, + "kl": 0.07929861824959517, + "learning_rate": 6.4e-07, + "loss": -0.0576, + "num_tokens": 2379353.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 8081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 149.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.724581003189087, + "kl": 0.03500372124835849, + "learning_rate": 6.396666666666667e-07, + "loss": -0.0317, + "num_tokens": 2379653.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 8082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 149.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036604177206754684, + "kl": 0.005146813113242388, + "learning_rate": 6.393333333333334e-07, + "loss": 0.0003, + "num_tokens": 2379989.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 149.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546838402748108, + "kl": 0.09196072816848755, + "learning_rate": 6.39e-07, + "loss": -0.0941, + "num_tokens": 2380330.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 149.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02282535843551159, + "kl": 0.011365019716322422, + "learning_rate": 6.386666666666666e-07, + "loss": 0.0006, + "num_tokens": 2380590.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 149.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0218175258487463, + "kl": 0.0015938090509735048, + "learning_rate": 6.383333333333334e-07, + "loss": 0.0001, + "num_tokens": 2380813.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 149.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014930716715753078, + "kl": 0.006839309353381395, + "learning_rate": 6.380000000000001e-07, + "loss": 0.0003, + "num_tokens": 2381102.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 149.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07219650596380234, + "kl": 0.014264614321291447, + "learning_rate": 6.376666666666666e-07, + "loss": 0.0007, + "num_tokens": 2381431.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06212437152862549, + "kl": 0.011779449065215886, + "learning_rate": 6.373333333333333e-07, + "loss": 0.0006, + "num_tokens": 2381713.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 149.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06987587362527847, + "kl": 0.004480735864490271, + "learning_rate": 6.370000000000001e-07, + "loss": 0.0002, + "num_tokens": 2381985.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 149.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03254120796918869, + "kl": 0.007599015021696687, + "learning_rate": 6.366666666666667e-07, + "loss": 0.0004, + "num_tokens": 2382278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 149.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763347774744034, + "kl": 0.04218504764139652, + "learning_rate": 6.363333333333334e-07, + "loss": 0.0022, + "num_tokens": 2382568.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 149.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002058211830444634, + "kl": 4.693865776062012e-06, + "learning_rate": 6.36e-07, + "loss": 0.0, + "num_tokens": 2382788.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 149.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16519120335578918, + "kl": 0.03383318521082401, + "learning_rate": 6.356666666666666e-07, + "loss": 0.0016, + "num_tokens": 2383112.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 149.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19964449107646942, + "kl": 0.011453303974121809, + "learning_rate": 6.353333333333334e-07, + "loss": 0.0006, + "num_tokens": 2383351.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 149.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00947615411132574, + "kl": 0.0016359619330614805, + "learning_rate": 6.350000000000001e-07, + "loss": 0.0001, + "num_tokens": 2383647.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 149.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0793096050620079, + "kl": 0.012051485944539309, + "learning_rate": 6.346666666666666e-07, + "loss": 0.0006, + "num_tokens": 2383927.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 149.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2394258677959442, + "kl": 0.03626616485416889, + "learning_rate": 6.343333333333333e-07, + "loss": 0.0016, + "num_tokens": 2384244.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 149.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014735878445208073, + "kl": 0.0004193728236714378, + "learning_rate": 6.340000000000001e-07, + "loss": 0.0, + "num_tokens": 2384506.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 150.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02108786255121231, + "kl": 0.0026528770104050636, + "learning_rate": 6.336666666666667e-07, + "loss": 0.0001, + "num_tokens": 2384818.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 150.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07298009097576141, + "kl": 0.041613128036260605, + "learning_rate": 6.333333333333333e-07, + "loss": 0.0021, + "num_tokens": 2385116.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 150.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04171275720000267, + "kl": 0.0011490675387904048, + "learning_rate": 6.33e-07, + "loss": 0.0001, + "num_tokens": 2385373.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 150.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4373611807823181, + "kl": 0.03042233525775373, + "learning_rate": 6.326666666666667e-07, + "loss": 0.0016, + "num_tokens": 2385643.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 150.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1503992080688477, + "kl": 0.0628511905670166, + "learning_rate": 6.323333333333334e-07, + "loss": 0.2016, + "num_tokens": 2386031.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 8104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 150.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06314212083816528, + "kl": 0.007682716008275747, + "learning_rate": 6.32e-07, + "loss": 0.0005, + "num_tokens": 2386298.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 150.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036239244509488344, + "kl": 0.0002377443015575409, + "learning_rate": 6.316666666666666e-07, + "loss": 0.0, + "num_tokens": 2386541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 150.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09184882044792175, + "kl": 0.014567125719622709, + "learning_rate": 6.313333333333333e-07, + "loss": 0.0006, + "num_tokens": 2386862.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 150.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1954093873500824, + "kl": 0.04638131766114384, + "learning_rate": 6.310000000000001e-07, + "loss": 0.0026, + "num_tokens": 2387161.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 150.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02628750540316105, + "kl": 0.001577138900756836, + "learning_rate": 6.306666666666666e-07, + "loss": 0.0001, + "num_tokens": 2387373.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 150.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004877230618149042, + "kl": 0.00024380088143516332, + "learning_rate": 6.303333333333333e-07, + "loss": 0.0, + "num_tokens": 2387593.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 150.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016999825835227966, + "kl": 0.0036536535844788887, + "learning_rate": 6.3e-07, + "loss": 0.0002, + "num_tokens": 2387853.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 150.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.579219341278076, + "kl": 0.029781543475110084, + "learning_rate": 6.296666666666667e-07, + "loss": 0.1949, + "num_tokens": 2388126.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 8112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 150.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0728498324751854, + "kl": 0.018589243292808533, + "learning_rate": 6.293333333333334e-07, + "loss": 0.0009, + "num_tokens": 2388398.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 150.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9928205013275146, + "kl": 0.024965515360236168, + "learning_rate": 6.29e-07, + "loss": 0.1113, + "num_tokens": 2388741.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 150.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017191950231790543, + "kl": 0.04064549319446087, + "learning_rate": 6.286666666666667e-07, + "loss": 0.002, + "num_tokens": 2389146.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 150.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.48533296585083, + "kl": 0.03565186820924282, + "learning_rate": 6.283333333333333e-07, + "loss": 0.0324, + "num_tokens": 2389447.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 150.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05955472216010094, + "kl": 0.020073309540748596, + "learning_rate": 6.280000000000001e-07, + "loss": 0.001, + "num_tokens": 2389751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 150.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033035438507795334, + "kl": 0.01324456837028265, + "learning_rate": 6.276666666666667e-07, + "loss": 0.0007, + "num_tokens": 2390055.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 150.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.610957145690918, + "kl": 1.340236946940422, + "learning_rate": 6.273333333333333e-07, + "loss": 0.0463, + "num_tokens": 2390425.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 8119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 150.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10300295054912567, + "kl": 0.012131822062656283, + "learning_rate": 6.27e-07, + "loss": 0.0006, + "num_tokens": 2390727.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 150.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17439605295658112, + "kl": 0.06616406515240669, + "learning_rate": 6.266666666666668e-07, + "loss": 0.0033, + "num_tokens": 2391094.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 150.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015556970611214638, + "kl": 0.006047853392374236, + "learning_rate": 6.263333333333333e-07, + "loss": 0.0003, + "num_tokens": 2391362.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 150.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025696801021695137, + "kl": 0.012086418457329273, + "learning_rate": 6.26e-07, + "loss": 0.0007, + "num_tokens": 2391636.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 150.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005189818912185729, + "kl": 0.0012384653673507273, + "learning_rate": 6.256666666666667e-07, + "loss": 0.0001, + "num_tokens": 2391916.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 150.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02235209196805954, + "kl": 0.011526144109666348, + "learning_rate": 6.253333333333333e-07, + "loss": 0.0006, + "num_tokens": 2392176.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 150.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02855044975876808, + "kl": 0.009233035147190094, + "learning_rate": 6.25e-07, + "loss": 0.0005, + "num_tokens": 2392467.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 150.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018995990976691246, + "kl": 0.2653230279684067, + "learning_rate": 6.246666666666667e-07, + "loss": 0.0133, + "num_tokens": 2392771.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 150.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026809118688106537, + "kl": 0.0009299021621700376, + "learning_rate": 6.243333333333333e-07, + "loss": 0.0, + "num_tokens": 2393087.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 150.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0077546448446810246, + "kl": 0.009261199971660972, + "learning_rate": 6.24e-07, + "loss": 0.0004, + "num_tokens": 2393373.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 150.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06505013257265091, + "kl": 0.058848634362220764, + "learning_rate": 6.236666666666668e-07, + "loss": 0.0029, + "num_tokens": 2393749.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 150.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027938757091760635, + "kl": 0.000877678394317627, + "learning_rate": 6.233333333333333e-07, + "loss": 0.0, + "num_tokens": 2394009.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 150.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03115653619170189, + "kl": 0.032979780808091164, + "learning_rate": 6.23e-07, + "loss": 0.0017, + "num_tokens": 2394376.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 150.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05155746638774872, + "kl": 0.00849071890115738, + "learning_rate": 6.226666666666667e-07, + "loss": 0.0004, + "num_tokens": 2394658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 150.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026812391355633736, + "kl": 0.0014806622930336744, + "learning_rate": 6.223333333333334e-07, + "loss": 0.0001, + "num_tokens": 2394938.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 150.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3631722927093506, + "kl": 0.016186361317522824, + "learning_rate": 6.22e-07, + "loss": 0.2206, + "num_tokens": 2395312.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 150.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006051879376173019, + "kl": 0.1613881066441536, + "learning_rate": 6.216666666666667e-07, + "loss": 0.0081, + "num_tokens": 2395621.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 150.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.970693588256836, + "kl": 0.017016710247844458, + "learning_rate": 6.213333333333333e-07, + "loss": 0.1097, + "num_tokens": 2395980.0, + "reward": 5.25, + "reward_std": 5.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 5.5, + "step": 8137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 150.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0967727899551392, + "kl": 0.14109086815733463, + "learning_rate": 6.21e-07, + "loss": 0.0076, + "num_tokens": 2396303.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 150.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029974566772580147, + "kl": 0.009257845114916563, + "learning_rate": 6.206666666666667e-07, + "loss": 0.0005, + "num_tokens": 2396625.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 150.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22037850320339203, + "kl": 0.026107670506462455, + "learning_rate": 6.203333333333333e-07, + "loss": 0.0013, + "num_tokens": 2396947.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 150.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011543075554072857, + "kl": 0.007443260634317994, + "learning_rate": 6.2e-07, + "loss": 0.0004, + "num_tokens": 2397219.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 150.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023880486842244864, + "kl": 1.6517937183380127e-05, + "learning_rate": 6.196666666666667e-07, + "loss": 0.0, + "num_tokens": 2397431.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 150.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06675207614898682, + "kl": 0.009099230170249939, + "learning_rate": 6.193333333333333e-07, + "loss": 0.0005, + "num_tokens": 2397743.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 150.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0447884164750576, + "kl": 0.0016662002162775025, + "learning_rate": 6.19e-07, + "loss": 0.0001, + "num_tokens": 2397976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 150.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01102589163929224, + "kl": 0.0026854098541662097, + "learning_rate": 6.186666666666667e-07, + "loss": 0.0001, + "num_tokens": 2398242.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 150.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00029954116325825453, + "kl": 6.973743438720703e-06, + "learning_rate": 6.183333333333334e-07, + "loss": 0.0, + "num_tokens": 2398462.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 150.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021404866129159927, + "kl": 0.002805741038173437, + "learning_rate": 6.18e-07, + "loss": 0.0001, + "num_tokens": 2398752.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 150.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00955558568239212, + "kl": 0.0019166171550750732, + "learning_rate": 6.176666666666667e-07, + "loss": 0.0001, + "num_tokens": 2398968.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 150.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007257889024913311, + "kl": 0.0014499574899673462, + "learning_rate": 6.173333333333334e-07, + "loss": 0.0001, + "num_tokens": 2399264.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 150.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029709329828619957, + "kl": 0.0026041120290756226, + "learning_rate": 6.17e-07, + "loss": 0.0001, + "num_tokens": 2399538.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 150.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09406719356775284, + "kl": 0.03556140046566725, + "learning_rate": 6.166666666666667e-07, + "loss": 0.0017, + "num_tokens": 2399884.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 150.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034665584564208984, + "kl": 0.0019402316538617015, + "learning_rate": 6.163333333333334e-07, + "loss": 0.0001, + "num_tokens": 2400156.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 150.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009009314235299826, + "kl": 0.003742985427379608, + "learning_rate": 6.16e-07, + "loss": 0.0002, + "num_tokens": 2400392.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 151.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03838036581873894, + "kl": 0.0008770450949668884, + "learning_rate": 6.156666666666667e-07, + "loss": 0.0, + "num_tokens": 2400604.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 151.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00961221195757389, + "kl": 0.0018416047096252441, + "learning_rate": 6.153333333333334e-07, + "loss": 0.0001, + "num_tokens": 2400820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 151.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01988982781767845, + "kl": 0.26517385244369507, + "learning_rate": 6.149999999999999e-07, + "loss": 0.0133, + "num_tokens": 2401124.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 151.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031023859977722168, + "kl": 0.004424812505021691, + "learning_rate": 6.146666666666667e-07, + "loss": 0.0002, + "num_tokens": 2401423.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 151.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12773260474205017, + "kl": 0.06399892829358578, + "learning_rate": 6.143333333333334e-07, + "loss": 0.0032, + "num_tokens": 2401803.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 151.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04989417642354965, + "kl": 0.001959426503162831, + "learning_rate": 6.14e-07, + "loss": 0.0001, + "num_tokens": 2402067.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 151.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021941643208265305, + "kl": 0.0004234723746776581, + "learning_rate": 6.136666666666666e-07, + "loss": 0.0, + "num_tokens": 2402327.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 151.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023062046617269516, + "kl": 0.006290606688708067, + "learning_rate": 6.133333333333334e-07, + "loss": 0.0003, + "num_tokens": 2402595.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 151.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020509045862127095, + "kl": 4.723668098449707e-06, + "learning_rate": 6.13e-07, + "loss": 0.0, + "num_tokens": 2402815.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 151.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3017526865005493, + "kl": 0.02648480422794819, + "learning_rate": 6.126666666666667e-07, + "loss": 0.0159, + "num_tokens": 2403105.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05675193667411804, + "kl": 0.01713305152952671, + "learning_rate": 6.123333333333334e-07, + "loss": 0.0009, + "num_tokens": 2403393.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 151.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6757495403289795, + "kl": 0.019472898915410042, + "learning_rate": 6.12e-07, + "loss": 0.1566, + "num_tokens": 2403740.0, + "reward": 5.625, + "reward_std": 4.75, + "rewards/reward_combined/mean": 5.625, + "rewards/reward_combined/std": 4.75, + "step": 8165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 151.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014792782254517078, + "kl": 0.0007511275762226433, + "learning_rate": 6.116666666666667e-07, + "loss": 0.0, + "num_tokens": 2404060.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 151.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2755134105682373, + "kl": 0.0266859628027305, + "learning_rate": 6.113333333333334e-07, + "loss": 0.0015, + "num_tokens": 2404360.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 151.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3069378435611725, + "kl": 0.027982468833215535, + "learning_rate": 6.11e-07, + "loss": 0.0016, + "num_tokens": 2404655.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 151.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004815730266273022, + "kl": 0.0002308189868927002, + "learning_rate": 6.106666666666666e-07, + "loss": 0.0, + "num_tokens": 2404875.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.221113681793213, + "kl": 0.059925079345703125, + "learning_rate": 6.103333333333334e-07, + "loss": 0.0693, + "num_tokens": 2405174.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 151.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.526832103729248, + "kl": 0.2515314519405365, + "learning_rate": 6.1e-07, + "loss": -0.0296, + "num_tokens": 2405578.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 8171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 151.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0165316890925169, + "kl": 0.0011796177714131773, + "learning_rate": 6.096666666666667e-07, + "loss": 0.0001, + "num_tokens": 2405854.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15650227665901184, + "kl": 0.03975836560130119, + "learning_rate": 6.093333333333333e-07, + "loss": 0.002, + "num_tokens": 2406152.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 151.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019037781283259392, + "kl": 0.0006773397326469421, + "learning_rate": 6.09e-07, + "loss": 0.0, + "num_tokens": 2406360.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 151.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04480242729187012, + "kl": 0.09906241297721863, + "learning_rate": 6.086666666666667e-07, + "loss": 0.005, + "num_tokens": 2406732.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 151.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019578933715820312, + "kl": 9.936392598319799e-05, + "learning_rate": 6.083333333333334e-07, + "loss": 0.0, + "num_tokens": 2406988.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 151.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008699082769453526, + "kl": 0.0008562388538848609, + "learning_rate": 6.08e-07, + "loss": 0.0, + "num_tokens": 2407270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 151.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009310757159255445, + "kl": 0.0037299245595932007, + "learning_rate": 6.076666666666666e-07, + "loss": 0.0002, + "num_tokens": 2407506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 151.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05726265162229538, + "kl": 0.0031598604982718825, + "learning_rate": 6.073333333333334e-07, + "loss": 0.0002, + "num_tokens": 2407780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026106981560587883, + "kl": 0.00911693787202239, + "learning_rate": 6.070000000000001e-07, + "loss": 0.0005, + "num_tokens": 2408107.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 151.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06957307457923889, + "kl": 0.011702904012054205, + "learning_rate": 6.066666666666666e-07, + "loss": 0.0006, + "num_tokens": 2408381.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 151.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027237365022301674, + "kl": 0.00184708833694458, + "learning_rate": 6.063333333333333e-07, + "loss": 0.0001, + "num_tokens": 2408593.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 71.5, + "completions/mean_terminated_length": 71.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 151.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.41719388961792, + "kl": 0.04268419893924147, + "learning_rate": 6.060000000000001e-07, + "loss": 0.4343, + "num_tokens": 2409099.0, + "reward": 6.625, + "reward_std": 1.75, + "rewards/reward_combined/mean": 6.625, + "rewards/reward_combined/std": 1.75, + "step": 8183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 151.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12419752776622772, + "kl": 0.016777854412794113, + "learning_rate": 6.056666666666667e-07, + "loss": 0.0008, + "num_tokens": 2409401.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 151.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006372503004968166, + "kl": 0.00023161139688454568, + "learning_rate": 6.053333333333334e-07, + "loss": 0.0, + "num_tokens": 2409715.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 151.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0385553315281868, + "kl": 0.006798181275371462, + "learning_rate": 6.05e-07, + "loss": 0.0003, + "num_tokens": 2410029.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 151.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09173041582107544, + "kl": 0.009674896486103535, + "learning_rate": 6.046666666666666e-07, + "loss": 0.0005, + "num_tokens": 2410363.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 151.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.562796592712402, + "kl": 0.11481832526624203, + "learning_rate": 6.043333333333334e-07, + "loss": -0.0591, + "num_tokens": 2410724.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 151.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9219738245010376, + "kl": 0.10625160112977028, + "learning_rate": 6.040000000000001e-07, + "loss": 0.178, + "num_tokens": 2411185.0, + "reward": 4.25, + "reward_std": 4.092676162719727, + "rewards/reward_combined/mean": 4.25, + "rewards/reward_combined/std": 4.092676162719727, + "step": 8189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 151.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023659836500883102, + "kl": 0.0008029512246139348, + "learning_rate": 6.036666666666666e-07, + "loss": 0.0, + "num_tokens": 2411420.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 151.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02845223993062973, + "kl": 0.011919802287593484, + "learning_rate": 6.033333333333333e-07, + "loss": 0.0007, + "num_tokens": 2411694.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 151.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25670185685157776, + "kl": 0.039418669417500496, + "learning_rate": 6.030000000000001e-07, + "loss": 0.0022, + "num_tokens": 2411960.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 151.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.40407943725586, + "kl": 0.036688029766082764, + "learning_rate": 6.026666666666667e-07, + "loss": 0.1726, + "num_tokens": 2412230.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 8193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 151.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004988313186913729, + "kl": 0.0003746040165424347, + "learning_rate": 6.023333333333333e-07, + "loss": 0.0, + "num_tokens": 2412474.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 151.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00045529688941314816, + "kl": 0.0012374690850265324, + "learning_rate": 6.02e-07, + "loss": 0.0001, + "num_tokens": 2412754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 151.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08678068965673447, + "kl": 0.03031645342707634, + "learning_rate": 6.016666666666667e-07, + "loss": 0.0016, + "num_tokens": 2413042.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 151.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06964217126369476, + "kl": 0.005343659780919552, + "learning_rate": 6.013333333333334e-07, + "loss": 0.0003, + "num_tokens": 2413319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 151.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07598478347063065, + "kl": 0.01780601590871811, + "learning_rate": 6.01e-07, + "loss": 0.0008, + "num_tokens": 2413621.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.009413719177246, + "kl": 0.1728421449661255, + "learning_rate": 6.006666666666666e-07, + "loss": 0.0602, + "num_tokens": 2413935.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 151.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027501234784722328, + "kl": 0.0027347643626853824, + "learning_rate": 6.003333333333333e-07, + "loss": 0.0001, + "num_tokens": 2414263.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 151.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05053351819515228, + "kl": 0.0061572156846523285, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0003, + "num_tokens": 2414575.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 151.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006172960624098778, + "kl": 0.00041054486064240336, + "learning_rate": 5.996666666666666e-07, + "loss": 0.0, + "num_tokens": 2414835.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 151.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17876817286014557, + "kl": 0.043079666793346405, + "learning_rate": 5.993333333333333e-07, + "loss": 0.0022, + "num_tokens": 2415176.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 151.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08830154687166214, + "kl": 0.033968967385590076, + "learning_rate": 5.99e-07, + "loss": 0.0019, + "num_tokens": 2415498.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 151.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.388647556304932, + "kl": 0.06377786211669445, + "learning_rate": 5.986666666666667e-07, + "loss": 0.059, + "num_tokens": 2415806.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 151.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054406873881816864, + "kl": 0.02034620102494955, + "learning_rate": 5.983333333333334e-07, + "loss": 0.001, + "num_tokens": 2416170.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 151.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001832791487686336, + "kl": 9.834766387939453e-06, + "learning_rate": 5.98e-07, + "loss": 0.0, + "num_tokens": 2416382.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08657736331224442, + "kl": 0.017484460957348347, + "learning_rate": 5.976666666666667e-07, + "loss": 0.001, + "num_tokens": 2416666.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 152.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033489666879177094, + "kl": 0.004563005641102791, + "learning_rate": 5.973333333333333e-07, + "loss": 0.0002, + "num_tokens": 2416978.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 152.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006274409592151642, + "kl": 0.0003683716058731079, + "learning_rate": 5.970000000000001e-07, + "loss": 0.0, + "num_tokens": 2417238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 152.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012485303916037083, + "kl": 0.0005397619243012741, + "learning_rate": 5.966666666666667e-07, + "loss": 0.0, + "num_tokens": 2417557.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 152.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014974136836826801, + "kl": 0.002691819565370679, + "learning_rate": 5.963333333333333e-07, + "loss": 0.0001, + "num_tokens": 2417846.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 152.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008965237648226321, + "kl": 0.0037411153316497803, + "learning_rate": 5.96e-07, + "loss": 0.0002, + "num_tokens": 2418082.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020208600908517838, + "kl": 0.006959666614420712, + "learning_rate": 5.956666666666668e-07, + "loss": 0.0003, + "num_tokens": 2418371.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 152.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0057055833749473095, + "kl": 0.0008480790420435369, + "learning_rate": 5.953333333333333e-07, + "loss": 0.0, + "num_tokens": 2418649.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 152.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14198848605155945, + "kl": 0.06454429216682911, + "learning_rate": 5.95e-07, + "loss": 0.0032, + "num_tokens": 2419025.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 152.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034001439809799194, + "kl": 0.012066320516169071, + "learning_rate": 5.946666666666667e-07, + "loss": 0.0007, + "num_tokens": 2419299.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 152.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06343141943216324, + "kl": 0.0058200303465127945, + "learning_rate": 5.943333333333333e-07, + "loss": 0.0003, + "num_tokens": 2419622.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 152.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006403795909136534, + "kl": 0.0002362173399887979, + "learning_rate": 5.94e-07, + "loss": 0.0, + "num_tokens": 2419936.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 152.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029874155297875404, + "kl": 5.364914613892324e-05, + "learning_rate": 5.936666666666667e-07, + "loss": 0.0, + "num_tokens": 2420208.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 152.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1439686119556427, + "kl": 0.010446007363498211, + "learning_rate": 5.933333333333333e-07, + "loss": 0.0006, + "num_tokens": 2420427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 152.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10712690651416779, + "kl": 0.0130801722407341, + "learning_rate": 5.93e-07, + "loss": 0.0006, + "num_tokens": 2420722.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 152.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10352638363838196, + "kl": 0.007192038930952549, + "learning_rate": 5.926666666666668e-07, + "loss": 0.0003, + "num_tokens": 2420986.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 152.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2082226276397705, + "kl": 0.09929879754781723, + "learning_rate": 5.923333333333333e-07, + "loss": -0.1475, + "num_tokens": 2421351.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 152.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052132461220026016, + "kl": 0.058430589735507965, + "learning_rate": 5.92e-07, + "loss": 0.0029, + "num_tokens": 2421690.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 152.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022532550618052483, + "kl": 0.011368549428880215, + "learning_rate": 5.916666666666667e-07, + "loss": 0.0006, + "num_tokens": 2421950.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 152.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05951330065727234, + "kl": 0.008515564724802971, + "learning_rate": 5.913333333333334e-07, + "loss": 0.0004, + "num_tokens": 2422250.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.093994617462158, + "kl": 0.0455700708553195, + "learning_rate": 5.91e-07, + "loss": 0.1604, + "num_tokens": 2422533.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 8228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 152.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007217801176011562, + "kl": 0.16149814426898956, + "learning_rate": 5.906666666666667e-07, + "loss": 0.0081, + "num_tokens": 2422842.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 2.75, + "completions/mean_terminated_length": 2.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 152.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0687694326043129, + "kl": 0.003531071590259671, + "learning_rate": 5.903333333333333e-07, + "loss": 0.0002, + "num_tokens": 2423049.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 152.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01360238716006279, + "kl": 0.0017036875651683658, + "learning_rate": 5.9e-07, + "loss": 0.0001, + "num_tokens": 2423319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 152.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009090722538530827, + "kl": 0.00032445043325424194, + "learning_rate": 5.896666666666667e-07, + "loss": 0.0, + "num_tokens": 2423563.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 152.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007899472489953041, + "kl": 0.00012859403796028346, + "learning_rate": 5.893333333333333e-07, + "loss": 0.0, + "num_tokens": 2423819.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 152.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1338353157043457, + "kl": 0.02521562296897173, + "learning_rate": 5.89e-07, + "loss": 0.0013, + "num_tokens": 2424131.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 152.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06179427355527878, + "kl": 0.012030292768031359, + "learning_rate": 5.886666666666667e-07, + "loss": 0.0006, + "num_tokens": 2424413.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 152.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022431688383221626, + "kl": 0.00026538968086242676, + "learning_rate": 5.883333333333333e-07, + "loss": 0.0, + "num_tokens": 2424625.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 152.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031857602298259735, + "kl": 0.03791419789195061, + "learning_rate": 5.88e-07, + "loss": 0.002, + "num_tokens": 2424985.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 152.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2835279703140259, + "kl": 0.0566001208499074, + "learning_rate": 5.876666666666667e-07, + "loss": 0.0031, + "num_tokens": 2425307.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 152.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04293827712535858, + "kl": 0.01038780459202826, + "learning_rate": 5.873333333333334e-07, + "loss": 0.0005, + "num_tokens": 2425631.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 152.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07900404930114746, + "kl": 0.005145840812474489, + "learning_rate": 5.87e-07, + "loss": 0.0003, + "num_tokens": 2425909.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 152.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006395588279701769, + "kl": 0.0013253133511170745, + "learning_rate": 5.866666666666667e-07, + "loss": 0.0001, + "num_tokens": 2426186.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 152.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032005488174036145, + "kl": 8.001923561096191e-06, + "learning_rate": 5.863333333333334e-07, + "loss": 0.0, + "num_tokens": 2426406.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 152.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15829156339168549, + "kl": 0.03112404327839613, + "learning_rate": 5.86e-07, + "loss": 0.0016, + "num_tokens": 2426743.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 152.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022487297654151917, + "kl": 0.00100295664742589, + "learning_rate": 5.856666666666667e-07, + "loss": 0.0001, + "num_tokens": 2427011.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 152.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01829039677977562, + "kl": 0.0015137278387555853, + "learning_rate": 5.853333333333334e-07, + "loss": 0.0001, + "num_tokens": 2427232.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 152.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026136793196201324, + "kl": 0.001528024673461914, + "learning_rate": 5.85e-07, + "loss": 0.0001, + "num_tokens": 2427444.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 152.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028324615210294724, + "kl": 0.09752213209867477, + "learning_rate": 5.846666666666667e-07, + "loss": 0.0049, + "num_tokens": 2427816.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 152.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006173395086079836, + "kl": 0.00046771764755249023, + "learning_rate": 5.843333333333334e-07, + "loss": 0.0, + "num_tokens": 2428076.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 152.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9802799224853516, + "kl": 0.1436002403497696, + "learning_rate": 5.839999999999999e-07, + "loss": 0.0393, + "num_tokens": 2428480.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 8249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 152.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.334186553955078, + "kl": 0.6243870556354523, + "learning_rate": 5.836666666666667e-07, + "loss": 0.0655, + "num_tokens": 2428804.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 152.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14654184877872467, + "kl": 0.012031571473926306, + "learning_rate": 5.833333333333334e-07, + "loss": 0.0006, + "num_tokens": 2429139.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 152.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01571676880121231, + "kl": 0.0015681475342717022, + "learning_rate": 5.83e-07, + "loss": 0.0001, + "num_tokens": 2429399.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05559800565242767, + "kl": 0.007426847703754902, + "learning_rate": 5.826666666666666e-07, + "loss": 0.0004, + "num_tokens": 2429670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 152.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05603273585438728, + "kl": 0.01806573662906885, + "learning_rate": 5.823333333333334e-07, + "loss": 0.0008, + "num_tokens": 2429969.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 152.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029086831957101822, + "kl": 0.0017388327396474779, + "learning_rate": 5.82e-07, + "loss": 0.0001, + "num_tokens": 2430265.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03411523252725601, + "kl": 0.02471522707492113, + "learning_rate": 5.816666666666667e-07, + "loss": 0.0013, + "num_tokens": 2430554.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 152.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017528988420963287, + "kl": 0.0006685789267066866, + "learning_rate": 5.813333333333334e-07, + "loss": 0.0, + "num_tokens": 2430790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 152.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019602010026574135, + "kl": 0.2652120590209961, + "learning_rate": 5.81e-07, + "loss": 0.0133, + "num_tokens": 2431094.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 152.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012098081409931183, + "kl": 0.0028562715742737055, + "learning_rate": 5.806666666666667e-07, + "loss": 0.0001, + "num_tokens": 2431385.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 152.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006116027943789959, + "kl": 0.0008660210878588259, + "learning_rate": 5.803333333333334e-07, + "loss": 0.0, + "num_tokens": 2431669.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 152.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.41422438621521, + "kl": 0.17736125737428665, + "learning_rate": 5.8e-07, + "loss": -0.114, + "num_tokens": 2432010.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 153.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03262478858232498, + "kl": 0.0066216300474479795, + "learning_rate": 5.796666666666666e-07, + "loss": 0.0003, + "num_tokens": 2432343.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 153.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.715142726898193, + "kl": 0.129999328404665, + "learning_rate": 5.793333333333334e-07, + "loss": 0.1642, + "num_tokens": 2432666.0, + "reward": 4.0, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.041451930999756, + "step": 8263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 153.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02258777618408203, + "kl": 0.0113847223110497, + "learning_rate": 5.79e-07, + "loss": 0.0006, + "num_tokens": 2432926.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 153.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.074158191680908, + "kl": 0.2561429899651557, + "learning_rate": 5.786666666666667e-07, + "loss": 0.1084, + "num_tokens": 2433204.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 8265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 153.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02532315067946911, + "kl": 0.0007674284279346466, + "learning_rate": 5.783333333333333e-07, + "loss": 0.0, + "num_tokens": 2433464.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 153.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01998990774154663, + "kl": 0.0006948375084903091, + "learning_rate": 5.78e-07, + "loss": 0.0, + "num_tokens": 2433685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 153.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.562995195388794, + "kl": 0.12784771621227264, + "learning_rate": 5.776666666666667e-07, + "loss": 0.0064, + "num_tokens": 2434055.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 153.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010777637362480164, + "kl": 0.0026645335310604423, + "learning_rate": 5.773333333333334e-07, + "loss": 0.0001, + "num_tokens": 2434321.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 153.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09361322969198227, + "kl": 0.011937583331018686, + "learning_rate": 5.77e-07, + "loss": 0.0006, + "num_tokens": 2434623.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 153.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08961789309978485, + "kl": 0.06746284291148186, + "learning_rate": 5.766666666666666e-07, + "loss": 0.0034, + "num_tokens": 2435001.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 153.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007029331289231777, + "kl": 0.0008635367848910391, + "learning_rate": 5.763333333333334e-07, + "loss": 0.0, + "num_tokens": 2435277.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 153.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017782054841518402, + "kl": 0.000344881416822318, + "learning_rate": 5.760000000000001e-07, + "loss": 0.0, + "num_tokens": 2435533.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 153.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015332967974245548, + "kl": 0.0010843120398931205, + "learning_rate": 5.756666666666666e-07, + "loss": 0.0001, + "num_tokens": 2435813.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 153.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04747886583209038, + "kl": 0.005562614183872938, + "learning_rate": 5.753333333333333e-07, + "loss": 0.0003, + "num_tokens": 2436099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 153.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009058780036866665, + "kl": 0.0003537870943546295, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "num_tokens": 2436343.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 153.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09518574178218842, + "kl": 0.0034202428651042283, + "learning_rate": 5.746666666666667e-07, + "loss": 0.0002, + "num_tokens": 2436616.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 153.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022815570700913668, + "kl": 4.509339669311885e-05, + "learning_rate": 5.743333333333334e-07, + "loss": 0.0, + "num_tokens": 2436888.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 153.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3772287368774414, + "kl": 0.018303331453353167, + "learning_rate": 5.74e-07, + "loss": 0.0009, + "num_tokens": 2437146.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 153.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03290873393416405, + "kl": 0.04473191127181053, + "learning_rate": 5.736666666666666e-07, + "loss": 0.0022, + "num_tokens": 2437550.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 153.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009710745071060956, + "kl": 0.003725379705429077, + "learning_rate": 5.733333333333334e-07, + "loss": 0.0002, + "num_tokens": 2437786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 153.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04793475568294525, + "kl": 0.005475924350321293, + "learning_rate": 5.730000000000001e-07, + "loss": 0.0003, + "num_tokens": 2438098.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 153.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.762014389038086, + "kl": 0.048394862562417984, + "learning_rate": 5.726666666666666e-07, + "loss": 0.0175, + "num_tokens": 2438463.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 8283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 153.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00917720515280962, + "kl": 0.15728817880153656, + "learning_rate": 5.723333333333333e-07, + "loss": 0.0078, + "num_tokens": 2438774.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 153.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03727919980883598, + "kl": 0.004796176450327039, + "learning_rate": 5.720000000000001e-07, + "loss": 0.0002, + "num_tokens": 2439105.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 153.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04206138476729393, + "kl": 0.027701175771653652, + "learning_rate": 5.716666666666667e-07, + "loss": 0.0014, + "num_tokens": 2439480.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 153.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07937157154083252, + "kl": 0.012952920515090227, + "learning_rate": 5.713333333333333e-07, + "loss": 0.0007, + "num_tokens": 2439808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 153.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0002558810811024159, + "kl": 5.826354026794434e-06, + "learning_rate": 5.71e-07, + "loss": 0.0, + "num_tokens": 2440028.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 153.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017458437010645866, + "kl": 0.0009235720208380371, + "learning_rate": 5.706666666666667e-07, + "loss": 0.0, + "num_tokens": 2440294.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 153.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09783709794282913, + "kl": 0.016546230297535658, + "learning_rate": 5.703333333333334e-07, + "loss": 0.0008, + "num_tokens": 2440621.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 153.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016708340495824814, + "kl": 0.00336581066949293, + "learning_rate": 5.7e-07, + "loss": 0.0002, + "num_tokens": 2440913.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 153.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09030290693044662, + "kl": 0.018074131337925792, + "learning_rate": 5.696666666666666e-07, + "loss": 0.001, + "num_tokens": 2441220.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 153.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48011282086372375, + "kl": 0.08293337374925613, + "learning_rate": 5.693333333333333e-07, + "loss": 0.0032, + "num_tokens": 2441537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 153.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056277938187122345, + "kl": 0.02923646569252014, + "learning_rate": 5.690000000000001e-07, + "loss": 0.0014, + "num_tokens": 2441811.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 153.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03138343244791031, + "kl": 0.001311879779677838, + "learning_rate": 5.686666666666667e-07, + "loss": 0.0001, + "num_tokens": 2442045.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 153.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328172206878662, + "kl": 0.24690625071525574, + "learning_rate": 5.683333333333333e-07, + "loss": 0.1248, + "num_tokens": 2442358.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 153.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00587368942797184, + "kl": 0.000370153778931126, + "learning_rate": 5.68e-07, + "loss": 0.0, + "num_tokens": 2442618.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 153.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08754284679889679, + "kl": 0.004288829397410154, + "learning_rate": 5.676666666666667e-07, + "loss": 0.0003, + "num_tokens": 2442845.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 153.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04405666142702103, + "kl": 0.002103633596561849, + "learning_rate": 5.673333333333334e-07, + "loss": 0.0001, + "num_tokens": 2443143.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 153.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005244787898845971, + "kl": 0.001316857582423836, + "learning_rate": 5.67e-07, + "loss": 0.0001, + "num_tokens": 2443420.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 153.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012456518597900867, + "kl": 0.0008397191413678229, + "learning_rate": 5.666666666666667e-07, + "loss": 0.0, + "num_tokens": 2443680.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 153.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031383346766233444, + "kl": 0.005854287534020841, + "learning_rate": 5.663333333333333e-07, + "loss": 0.0002, + "num_tokens": 2443996.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 153.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751035213470459, + "kl": 0.048893094062805176, + "learning_rate": 5.660000000000001e-07, + "loss": 0.0024, + "num_tokens": 2444288.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 153.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01674734055995941, + "kl": 0.0001826956868171692, + "learning_rate": 5.656666666666667e-07, + "loss": 0.0, + "num_tokens": 2444498.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 153.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026693180203437805, + "kl": 0.0016957670450210571, + "learning_rate": 5.653333333333333e-07, + "loss": 0.0001, + "num_tokens": 2444710.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 153.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017887800931930542, + "kl": 0.005099207162857056, + "learning_rate": 5.65e-07, + "loss": 0.0003, + "num_tokens": 2444978.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 153.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458885096013546, + "kl": 0.01571763912215829, + "learning_rate": 5.646666666666667e-07, + "loss": 0.0006, + "num_tokens": 2445365.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 153.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06181563809514046, + "kl": 0.002845512703061104, + "learning_rate": 5.643333333333333e-07, + "loss": 0.0001, + "num_tokens": 2445657.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 153.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03170020133256912, + "kl": 0.00026510655879974365, + "learning_rate": 5.64e-07, + "loss": 0.0, + "num_tokens": 2445869.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 153.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031905390322208405, + "kl": 0.002102102618664503, + "learning_rate": 5.636666666666667e-07, + "loss": 0.0001, + "num_tokens": 2446192.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 153.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09781618416309357, + "kl": 0.03031076118350029, + "learning_rate": 5.633333333333333e-07, + "loss": 0.0014, + "num_tokens": 2446538.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 153.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018083926290273666, + "kl": 0.0005412757454905659, + "learning_rate": 5.63e-07, + "loss": 0.0, + "num_tokens": 2446854.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 153.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9756832122802734, + "kl": 0.061098862439394, + "learning_rate": 5.626666666666667e-07, + "loss": 0.0038, + "num_tokens": 2447201.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 153.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021097060292959213, + "kl": 0.008475386537611485, + "learning_rate": 5.623333333333333e-07, + "loss": 0.0004, + "num_tokens": 2447532.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 153.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.73818039894104, + "kl": 0.04998205788433552, + "learning_rate": 5.62e-07, + "loss": 0.1493, + "num_tokens": 2447849.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 154.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03913053125143051, + "kl": 0.008828048594295979, + "learning_rate": 5.616666666666668e-07, + "loss": 0.0004, + "num_tokens": 2448139.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 154.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008888961747288704, + "kl": 0.000478645961266011, + "learning_rate": 5.613333333333333e-07, + "loss": 0.0, + "num_tokens": 2448458.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05503654107451439, + "kl": 0.004644736181944609, + "learning_rate": 5.61e-07, + "loss": 0.0002, + "num_tokens": 2448732.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 154.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04165855422616005, + "kl": 0.0015937138814479113, + "learning_rate": 5.606666666666667e-07, + "loss": 0.0001, + "num_tokens": 2449006.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 154.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006506198085844517, + "kl": 0.00023494711422245018, + "learning_rate": 5.603333333333334e-07, + "loss": 0.0, + "num_tokens": 2449320.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 154.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000523120048455894, + "kl": 1.858919858932495e-05, + "learning_rate": 5.6e-07, + "loss": 0.0, + "num_tokens": 2449532.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 154.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004662947729229927, + "kl": 0.00022652148618362844, + "learning_rate": 5.596666666666667e-07, + "loss": 0.0, + "num_tokens": 2449752.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030857738107442856, + "kl": 0.001850803499110043, + "learning_rate": 5.593333333333333e-07, + "loss": 0.0001, + "num_tokens": 2450050.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 154.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08106081187725067, + "kl": 0.007502306718379259, + "learning_rate": 5.59e-07, + "loss": 0.0004, + "num_tokens": 2450386.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 154.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026218606159090996, + "kl": 0.0015904158353805542, + "learning_rate": 5.586666666666667e-07, + "loss": 0.0001, + "num_tokens": 2450598.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 154.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00042798908543772995, + "kl": 0.0012370496988296509, + "learning_rate": 5.583333333333333e-07, + "loss": 0.0001, + "num_tokens": 2450878.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 154.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02619774080812931, + "kl": 0.0007577687720186077, + "learning_rate": 5.58e-07, + "loss": 0.0, + "num_tokens": 2451134.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 154.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030264941975474358, + "kl": 0.004409928224049509, + "learning_rate": 5.576666666666667e-07, + "loss": 0.0002, + "num_tokens": 2451422.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 154.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029210738837718964, + "kl": 0.0733129046857357, + "learning_rate": 5.573333333333333e-07, + "loss": 0.0037, + "num_tokens": 2451793.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 154.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005117594730108976, + "kl": 0.0003747120499610901, + "learning_rate": 5.57e-07, + "loss": 0.0, + "num_tokens": 2452037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 154.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03576855733990669, + "kl": 0.00482076033949852, + "learning_rate": 5.566666666666667e-07, + "loss": 0.0002, + "num_tokens": 2452364.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 154.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07462895661592484, + "kl": 0.03889967314898968, + "learning_rate": 5.563333333333334e-07, + "loss": 0.002, + "num_tokens": 2452745.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 154.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018847031518816948, + "kl": 0.26534825563430786, + "learning_rate": 5.56e-07, + "loss": 0.0133, + "num_tokens": 2453049.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 154.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06223014369606972, + "kl": 0.011949718929827213, + "learning_rate": 5.556666666666667e-07, + "loss": 0.0006, + "num_tokens": 2453326.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 154.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10442297905683517, + "kl": 0.032682210206985474, + "learning_rate": 5.553333333333334e-07, + "loss": 0.0017, + "num_tokens": 2453630.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 154.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1021322011947632, + "kl": 0.13737642765045166, + "learning_rate": 5.55e-07, + "loss": 0.0069, + "num_tokens": 2453969.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 154.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00566828390583396, + "kl": 0.0003056225832551718, + "learning_rate": 5.546666666666667e-07, + "loss": 0.0, + "num_tokens": 2454231.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 154.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006953234784305096, + "kl": 0.16165469586849213, + "learning_rate": 5.543333333333333e-07, + "loss": 0.0081, + "num_tokens": 2454540.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 154.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.634182929992676, + "kl": 0.04869305342435837, + "learning_rate": 5.54e-07, + "loss": 0.147, + "num_tokens": 2454883.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 154.44444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.85901141166687, + "kl": 0.022627011872828007, + "learning_rate": 5.536666666666667e-07, + "loss": 0.1245, + "num_tokens": 2455238.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 154.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005726439878344536, + "kl": 0.0005736123712267727, + "learning_rate": 5.533333333333334e-07, + "loss": 0.0, + "num_tokens": 2455472.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01484580710530281, + "kl": 0.0012317707878537476, + "learning_rate": 5.529999999999999e-07, + "loss": 0.0001, + "num_tokens": 2455750.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 154.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03656516224145889, + "kl": 0.012933549180161208, + "learning_rate": 5.526666666666667e-07, + "loss": 0.0007, + "num_tokens": 2456037.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 154.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8089420795440674, + "kl": 0.044436972588300705, + "learning_rate": 5.523333333333334e-07, + "loss": 0.0991, + "num_tokens": 2456375.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 8344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 154.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.845674991607666, + "kl": 0.02177418302744627, + "learning_rate": 5.52e-07, + "loss": -0.0595, + "num_tokens": 2456720.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 8345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 154.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000318619393510744, + "kl": 8.121132850646973e-06, + "learning_rate": 5.516666666666666e-07, + "loss": 0.0, + "num_tokens": 2456940.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 154.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005950066260993481, + "kl": 0.0003937259316444397, + "learning_rate": 5.513333333333334e-07, + "loss": 0.0, + "num_tokens": 2457200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 154.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.636676073074341, + "kl": 0.07898950390517712, + "learning_rate": 5.51e-07, + "loss": -0.0107, + "num_tokens": 2457503.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 154.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029922667890787125, + "kl": 0.001224012579768896, + "learning_rate": 5.506666666666667e-07, + "loss": 0.0001, + "num_tokens": 2457808.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 154.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007834872230887413, + "kl": 0.0016319826245307922, + "learning_rate": 5.503333333333334e-07, + "loss": 0.0001, + "num_tokens": 2458024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 154.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.977325916290283, + "kl": 0.013910597190260887, + "learning_rate": 5.499999999999999e-07, + "loss": -0.1529, + "num_tokens": 2458292.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 8351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 154.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13469654321670532, + "kl": 0.021035901736468077, + "learning_rate": 5.496666666666667e-07, + "loss": 0.001, + "num_tokens": 2458592.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 154.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04311174899339676, + "kl": 0.006896013393998146, + "learning_rate": 5.493333333333334e-07, + "loss": 0.0003, + "num_tokens": 2458870.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020176585763692856, + "kl": 0.00520420353859663, + "learning_rate": 5.49e-07, + "loss": 0.0003, + "num_tokens": 2459138.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 154.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060214847326278687, + "kl": 0.005490035400725901, + "learning_rate": 5.486666666666666e-07, + "loss": 0.0003, + "num_tokens": 2459396.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0613739900290966, + "kl": 0.0031856546411290765, + "learning_rate": 5.483333333333334e-07, + "loss": 0.0002, + "num_tokens": 2459666.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 154.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011704935692250729, + "kl": 0.007982588838785887, + "learning_rate": 5.48e-07, + "loss": 0.0004, + "num_tokens": 2459938.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 154.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04813944175839424, + "kl": 0.005820095539093018, + "learning_rate": 5.476666666666667e-07, + "loss": 0.0003, + "num_tokens": 2460250.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.0, + "completions/mean_terminated_length": 33.0, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 154.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10329499840736389, + "kl": 0.05913754925131798, + "learning_rate": 5.473333333333333e-07, + "loss": 0.003, + "num_tokens": 2460598.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 154.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004868348594754934, + "kl": 0.00010152508912142366, + "learning_rate": 5.47e-07, + "loss": 0.0, + "num_tokens": 2460868.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 154.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017690882086753845, + "kl": 0.0021029120252933353, + "learning_rate": 5.466666666666667e-07, + "loss": 0.0001, + "num_tokens": 2461138.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 154.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04372517764568329, + "kl": 0.0021331667667254806, + "learning_rate": 5.463333333333334e-07, + "loss": 0.0001, + "num_tokens": 2461392.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 154.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02498186193406582, + "kl": 0.006004116032272577, + "learning_rate": 5.46e-07, + "loss": 0.0003, + "num_tokens": 2461683.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 154.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04266771301627159, + "kl": 0.008875719271600246, + "learning_rate": 5.456666666666666e-07, + "loss": 0.0004, + "num_tokens": 2462013.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 154.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024272803217172623, + "kl": 0.04772520437836647, + "learning_rate": 5.453333333333334e-07, + "loss": 0.0024, + "num_tokens": 2462417.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 154.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0010209826286882162, + "kl": 0.003725387156009674, + "learning_rate": 5.450000000000001e-07, + "loss": 0.0002, + "num_tokens": 2462653.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 154.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3208865225315094, + "kl": 0.0930166020989418, + "learning_rate": 5.446666666666666e-07, + "loss": 0.0047, + "num_tokens": 2463022.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 154.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07350413501262665, + "kl": 0.03143562376499176, + "learning_rate": 5.443333333333333e-07, + "loss": 0.0016, + "num_tokens": 2463356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 154.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007228239439427853, + "kl": 0.0004351213574409485, + "learning_rate": 5.44e-07, + "loss": 0.0, + "num_tokens": 2463564.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.01315789483487606, + "clip_ratio/low_min": 0.01315789483487606, + "clip_ratio/region_mean": 0.01315789483487606, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 155.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.025207042694092, + "kl": 0.02106407703831792, + "learning_rate": 5.436666666666667e-07, + "loss": -0.0038, + "num_tokens": 2463856.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 155.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005544027779251337, + "kl": 0.00030422210693359375, + "learning_rate": 5.433333333333334e-07, + "loss": 0.0, + "num_tokens": 2464116.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 155.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09991784393787384, + "kl": 0.041562771424651146, + "learning_rate": 5.43e-07, + "loss": 0.0021, + "num_tokens": 2464411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 155.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.10620641708374, + "kl": 0.11383737996220589, + "learning_rate": 5.426666666666666e-07, + "loss": 0.1118, + "num_tokens": 2464709.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 155.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12902311980724335, + "kl": 0.052536096423864365, + "learning_rate": 5.423333333333334e-07, + "loss": 0.0027, + "num_tokens": 2465094.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00619744136929512, + "kl": 0.0010310067445971072, + "learning_rate": 5.420000000000001e-07, + "loss": 0.0001, + "num_tokens": 2465378.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010383947752416134, + "kl": 0.0004548997094389051, + "learning_rate": 5.416666666666666e-07, + "loss": 0.0, + "num_tokens": 2465651.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8376 + }, + { + "clip_ratio/high_max": 0.0058139534667134285, + "clip_ratio/high_mean": 0.0058139534667134285, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0058139534667134285, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 155.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265976905822754, + "kl": 0.08938579261302948, + "learning_rate": 5.413333333333333e-07, + "loss": -0.0056, + "num_tokens": 2466028.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 155.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08833947032690048, + "kl": 0.010712239891290665, + "learning_rate": 5.410000000000001e-07, + "loss": 0.0005, + "num_tokens": 2466272.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019876381382346153, + "kl": 0.005873143672943115, + "learning_rate": 5.406666666666667e-07, + "loss": 0.0003, + "num_tokens": 2466554.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 155.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058269817382097244, + "kl": 0.00956787308678031, + "learning_rate": 5.403333333333333e-07, + "loss": 0.0005, + "num_tokens": 2466878.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 155.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06682496517896652, + "kl": 0.0031241700053215027, + "learning_rate": 5.4e-07, + "loss": 0.0002, + "num_tokens": 2467094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 155.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11336594820022583, + "kl": 0.038844432681798935, + "learning_rate": 5.396666666666666e-07, + "loss": 0.002, + "num_tokens": 2467409.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 155.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022847386077046394, + "kl": 0.0005056319059804082, + "learning_rate": 5.393333333333334e-07, + "loss": 0.0, + "num_tokens": 2467642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046104151755571365, + "kl": 0.02171806525439024, + "learning_rate": 5.39e-07, + "loss": 0.0011, + "num_tokens": 2467915.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 155.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006710353307425976, + "kl": 0.00022089167396188714, + "learning_rate": 5.386666666666666e-07, + "loss": 0.0, + "num_tokens": 2468229.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.409780979156494, + "kl": 0.021229079458862543, + "learning_rate": 5.383333333333333e-07, + "loss": -0.0524, + "num_tokens": 2468518.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 155.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04751231148838997, + "kl": 0.0008119717240333557, + "learning_rate": 5.380000000000001e-07, + "loss": 0.0, + "num_tokens": 2468724.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 155.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0048167286440730095, + "kl": 0.00024247169494628906, + "learning_rate": 5.376666666666667e-07, + "loss": 0.0, + "num_tokens": 2468944.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011471696197986603, + "kl": 0.007820246275514364, + "learning_rate": 5.373333333333333e-07, + "loss": 0.0004, + "num_tokens": 2469216.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 155.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031831782311201096, + "kl": 0.012028172612190247, + "learning_rate": 5.37e-07, + "loss": 0.0007, + "num_tokens": 2469490.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 155.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00025179359363391995, + "kl": 5.7891011238098145e-06, + "learning_rate": 5.366666666666667e-07, + "loss": 0.0, + "num_tokens": 2469710.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 155.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06769102066755295, + "kl": 0.0086826549959369, + "learning_rate": 5.363333333333334e-07, + "loss": 0.0003, + "num_tokens": 2470028.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 155.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.087205171585083, + "kl": 0.22589676827192307, + "learning_rate": 5.36e-07, + "loss": 0.0115, + "num_tokens": 2470402.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 155.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01321250107139349, + "kl": 0.011940506286919117, + "learning_rate": 5.356666666666667e-07, + "loss": 0.0006, + "num_tokens": 2470682.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 155.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07551845908164978, + "kl": 0.05878164991736412, + "learning_rate": 5.353333333333333e-07, + "loss": 0.0029, + "num_tokens": 2471058.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 155.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48009926080703735, + "kl": 0.09998535551130772, + "learning_rate": 5.350000000000001e-07, + "loss": 0.0052, + "num_tokens": 2471398.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 155.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011485875584185123, + "kl": 0.0004405789077281952, + "learning_rate": 5.346666666666667e-07, + "loss": 0.0, + "num_tokens": 2471658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 155.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15054261684417725, + "kl": 0.018153753131628036, + "learning_rate": 5.343333333333333e-07, + "loss": 0.001, + "num_tokens": 2471960.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 155.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1454274654388428, + "kl": 0.11350664414931089, + "learning_rate": 5.34e-07, + "loss": -0.0078, + "num_tokens": 2472278.0, + "reward": 4.375, + "reward_std": 4.190763473510742, + "rewards/reward_combined/mean": 4.375, + "rewards/reward_combined/std": 4.190763473510742, + "step": 8399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 155.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019624611362814903, + "kl": 0.2651744931936264, + "learning_rate": 5.336666666666667e-07, + "loss": 0.0133, + "num_tokens": 2472582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 155.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043150998651981354, + "kl": 0.042211174964904785, + "learning_rate": 5.333333333333333e-07, + "loss": 0.0021, + "num_tokens": 2472986.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 155.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.781017303466797, + "kl": 0.08708360604941845, + "learning_rate": 5.33e-07, + "loss": 0.0482, + "num_tokens": 2473306.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 155.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014792831614613533, + "kl": 0.003580319113098085, + "learning_rate": 5.326666666666667e-07, + "loss": 0.0002, + "num_tokens": 2473574.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 155.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1653987467288971, + "kl": 0.011425634380429983, + "learning_rate": 5.323333333333333e-07, + "loss": 0.0006, + "num_tokens": 2473899.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 155.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011270777322351933, + "kl": 0.0037083476781845093, + "learning_rate": 5.32e-07, + "loss": 0.0002, + "num_tokens": 2474135.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 155.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14027844369411469, + "kl": 0.027131887152791023, + "learning_rate": 5.316666666666667e-07, + "loss": 0.0013, + "num_tokens": 2474440.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.0, + "completions/max_terminated_length": 54.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 155.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.275292158126831, + "kl": 0.08845902606844902, + "learning_rate": 5.313333333333333e-07, + "loss": 0.035, + "num_tokens": 2474818.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 155.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016922705108299851, + "kl": 6.44649080641102e-05, + "learning_rate": 5.31e-07, + "loss": 0.0, + "num_tokens": 2475090.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 155.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03561645373702049, + "kl": 0.0010564652038738132, + "learning_rate": 5.306666666666668e-07, + "loss": 0.0001, + "num_tokens": 2475347.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 155.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.05357551574707, + "kl": 0.01311790058389306, + "learning_rate": 5.303333333333333e-07, + "loss": 0.069, + "num_tokens": 2475610.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 155.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0975395143032074, + "kl": 0.012654355959966779, + "learning_rate": 5.3e-07, + "loss": 0.0006, + "num_tokens": 2475884.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 155.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.3723464012146, + "kl": 0.013765290612354875, + "learning_rate": 5.296666666666667e-07, + "loss": -0.0336, + "num_tokens": 2476164.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 8412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 155.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03530529513955116, + "kl": 0.00457672169432044, + "learning_rate": 5.293333333333333e-07, + "loss": 0.0002, + "num_tokens": 2476424.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 155.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08188877254724503, + "kl": 0.008390740025788546, + "learning_rate": 5.29e-07, + "loss": 0.0004, + "num_tokens": 2476726.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 155.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03391231968998909, + "kl": 0.0029532848857343197, + "learning_rate": 5.286666666666667e-07, + "loss": 0.0001, + "num_tokens": 2476986.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 155.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4728827476501465, + "kl": 0.7759107742458582, + "learning_rate": 5.283333333333333e-07, + "loss": -0.0477, + "num_tokens": 2477273.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 155.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007028269115835428, + "kl": 0.16169343888759613, + "learning_rate": 5.28e-07, + "loss": 0.0081, + "num_tokens": 2477582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 155.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002486934419721365, + "kl": 1.662224531173706e-05, + "learning_rate": 5.276666666666667e-07, + "loss": 0.0, + "num_tokens": 2477794.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 155.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006349485483951867, + "kl": 0.0013061835779808462, + "learning_rate": 5.273333333333333e-07, + "loss": 0.0001, + "num_tokens": 2478071.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 155.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02279621548950672, + "kl": 0.04854956269264221, + "learning_rate": 5.27e-07, + "loss": 0.0024, + "num_tokens": 2478413.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 155.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05165979266166687, + "kl": 0.015593254938721657, + "learning_rate": 5.266666666666667e-07, + "loss": 0.0008, + "num_tokens": 2478760.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 155.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023921294137835503, + "kl": 0.0012628336844500154, + "learning_rate": 5.263333333333333e-07, + "loss": 0.0001, + "num_tokens": 2478976.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 155.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038757625967264175, + "kl": 0.0057800025679171085, + "learning_rate": 5.26e-07, + "loss": 0.0003, + "num_tokens": 2479314.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 156.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04575493559241295, + "kl": 0.005509747192263603, + "learning_rate": 5.256666666666667e-07, + "loss": 0.0003, + "num_tokens": 2479626.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 156.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004278066102415323, + "kl": 0.001171433919807896, + "learning_rate": 5.253333333333334e-07, + "loss": 0.0001, + "num_tokens": 2479886.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 156.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06770186126232147, + "kl": 0.0031263157725334167, + "learning_rate": 5.25e-07, + "loss": 0.0002, + "num_tokens": 2480102.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 156.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.317131996154785, + "kl": 0.2115035280585289, + "learning_rate": 5.246666666666667e-07, + "loss": 0.1058, + "num_tokens": 2480482.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 8427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 156.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.500802993774414, + "kl": 0.5088626421420486, + "learning_rate": 5.243333333333334e-07, + "loss": 0.0665, + "num_tokens": 2480768.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 156.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015126118436455727, + "kl": 0.00393518028431572, + "learning_rate": 5.24e-07, + "loss": 0.0002, + "num_tokens": 2481026.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 156.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048743266612291336, + "kl": 0.004803207004442811, + "learning_rate": 5.236666666666667e-07, + "loss": 0.0002, + "num_tokens": 2481328.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 156.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01772715523838997, + "kl": 0.005148844327777624, + "learning_rate": 5.233333333333333e-07, + "loss": 0.0003, + "num_tokens": 2481596.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 156.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10989715158939362, + "kl": 0.011333472561091185, + "learning_rate": 5.23e-07, + "loss": 0.0006, + "num_tokens": 2481883.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 156.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018481656908988953, + "kl": 0.00038522534305229783, + "learning_rate": 5.226666666666667e-07, + "loss": 0.0, + "num_tokens": 2482126.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 156.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08538296818733215, + "kl": 0.0022356927511282265, + "learning_rate": 5.223333333333334e-07, + "loss": 0.0001, + "num_tokens": 2482339.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 156.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.616231918334961, + "kl": 0.26572537142783403, + "learning_rate": 5.219999999999999e-07, + "loss": 0.018, + "num_tokens": 2482615.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 156.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017566002905368805, + "kl": 0.002983683720231056, + "learning_rate": 5.216666666666667e-07, + "loss": 0.0002, + "num_tokens": 2482949.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 156.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053898610174655914, + "kl": 0.004574377555400133, + "learning_rate": 5.213333333333334e-07, + "loss": 0.0002, + "num_tokens": 2483247.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 156.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.173861026763916, + "kl": 0.09855138882994652, + "learning_rate": 5.21e-07, + "loss": 0.313, + "num_tokens": 2483547.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 156.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024702956900000572, + "kl": 0.04162203148007393, + "learning_rate": 5.206666666666666e-07, + "loss": 0.0021, + "num_tokens": 2483952.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 156.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5571836829185486, + "kl": 0.07365526258945465, + "learning_rate": 5.203333333333334e-07, + "loss": 0.0045, + "num_tokens": 2484227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 156.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00032187625765800476, + "kl": 8.128583431243896e-06, + "learning_rate": 5.2e-07, + "loss": 0.0, + "num_tokens": 2484447.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 156.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02686251327395439, + "kl": 0.005212683929130435, + "learning_rate": 5.196666666666667e-07, + "loss": 0.0003, + "num_tokens": 2484735.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 156.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02043980360031128, + "kl": 0.011767649091780186, + "learning_rate": 5.193333333333334e-07, + "loss": 0.0006, + "num_tokens": 2484995.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 156.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11391869187355042, + "kl": 0.043839987367391586, + "learning_rate": 5.189999999999999e-07, + "loss": 0.0022, + "num_tokens": 2485293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 156.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021185003221035004, + "kl": 0.2648857831954956, + "learning_rate": 5.186666666666667e-07, + "loss": 0.0132, + "num_tokens": 2485597.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 156.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017053665360435843, + "kl": 4.1743118345038965e-05, + "learning_rate": 5.183333333333334e-07, + "loss": 0.0, + "num_tokens": 2485869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 156.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03420504927635193, + "kl": 0.0038915553595870733, + "learning_rate": 5.18e-07, + "loss": 0.0002, + "num_tokens": 2486139.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 156.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002330042188987136, + "kl": 0.00031235069036483765, + "learning_rate": 5.176666666666666e-07, + "loss": 0.0, + "num_tokens": 2486399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 156.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04137839376926422, + "kl": 0.007751275785267353, + "learning_rate": 5.173333333333334e-07, + "loss": 0.0004, + "num_tokens": 2486694.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 156.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022049132734537125, + "kl": 0.0014656584244221449, + "learning_rate": 5.17e-07, + "loss": 0.0001, + "num_tokens": 2486972.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 156.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026242000982165337, + "kl": 0.0016636699438095093, + "learning_rate": 5.166666666666667e-07, + "loss": 0.0001, + "num_tokens": 2487184.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 156.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028206482529640198, + "kl": 0.055699342861771584, + "learning_rate": 5.163333333333333e-07, + "loss": 0.0028, + "num_tokens": 2487520.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 156.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015877556055784225, + "kl": 0.00019871890981448814, + "learning_rate": 5.16e-07, + "loss": 0.0, + "num_tokens": 2487776.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 156.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023555463179945946, + "kl": 0.07299611158668995, + "learning_rate": 5.156666666666667e-07, + "loss": 0.0037, + "num_tokens": 2488147.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 156.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009435631334781647, + "kl": 0.15927845239639282, + "learning_rate": 5.153333333333334e-07, + "loss": 0.008, + "num_tokens": 2488457.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 156.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05060119181871414, + "kl": 0.0022037744492990896, + "learning_rate": 5.15e-07, + "loss": 0.0001, + "num_tokens": 2488676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 156.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15501393377780914, + "kl": 0.016626416007056832, + "learning_rate": 5.146666666666666e-07, + "loss": 0.0009, + "num_tokens": 2489008.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 156.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04478546977043152, + "kl": 0.00516450684517622, + "learning_rate": 5.143333333333334e-07, + "loss": 0.0003, + "num_tokens": 2489320.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 156.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24517595767974854, + "kl": 0.04788942728191614, + "learning_rate": 5.140000000000001e-07, + "loss": 0.0024, + "num_tokens": 2489657.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 156.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2047641277313232, + "kl": 0.0207534022629261, + "learning_rate": 5.136666666666666e-07, + "loss": 0.0012, + "num_tokens": 2490003.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 156.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013539710082113743, + "kl": 0.0007548865396529436, + "learning_rate": 5.133333333333333e-07, + "loss": 0.0, + "num_tokens": 2490270.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 156.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6001968383789062, + "kl": 0.07414672523736954, + "learning_rate": 5.13e-07, + "loss": -0.0103, + "num_tokens": 2490635.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 156.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01546122133731842, + "kl": 0.006629306124523282, + "learning_rate": 5.126666666666667e-07, + "loss": 0.0003, + "num_tokens": 2490927.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 156.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06342559307813644, + "kl": 0.03193545900285244, + "learning_rate": 5.123333333333334e-07, + "loss": 0.0017, + "num_tokens": 2491312.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 156.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001116510247811675, + "kl": 0.0037094801664352417, + "learning_rate": 5.12e-07, + "loss": 0.0002, + "num_tokens": 2491548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 156.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0412026047706604, + "kl": 0.0005738437175750732, + "learning_rate": 5.116666666666666e-07, + "loss": 0.0, + "num_tokens": 2491758.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 156.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005234990967437625, + "kl": 0.00123817368876189, + "learning_rate": 5.113333333333334e-07, + "loss": 0.0001, + "num_tokens": 2492038.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 156.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0093684708699584, + "kl": 0.00044766482460545376, + "learning_rate": 5.110000000000001e-07, + "loss": 0.0, + "num_tokens": 2492352.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 156.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07600437104701996, + "kl": 0.008421032456681132, + "learning_rate": 5.106666666666666e-07, + "loss": 0.0004, + "num_tokens": 2492691.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 156.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05454156920313835, + "kl": 0.002588084666058421, + "learning_rate": 5.103333333333333e-07, + "loss": 0.0001, + "num_tokens": 2492963.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 156.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008401497267186642, + "kl": 0.0005766671383753419, + "learning_rate": 5.100000000000001e-07, + "loss": 0.0, + "num_tokens": 2493198.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 156.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23921537399291992, + "kl": 0.02082074456848204, + "learning_rate": 5.096666666666667e-07, + "loss": 0.001, + "num_tokens": 2493490.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 43.0, + "completions/mean_terminated_length": 43.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 156.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03600386902689934, + "kl": 0.013860939536243677, + "learning_rate": 5.093333333333333e-07, + "loss": 0.0005, + "num_tokens": 2493882.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 156.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04453738406300545, + "kl": 0.011568172369152308, + "learning_rate": 5.09e-07, + "loss": 0.0006, + "num_tokens": 2494204.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 156.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15386320650577545, + "kl": 0.014382415916770697, + "learning_rate": 5.086666666666666e-07, + "loss": 0.0008, + "num_tokens": 2494532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 156.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0813414454460144, + "kl": 0.018582265824079514, + "learning_rate": 5.083333333333334e-07, + "loss": 0.0009, + "num_tokens": 2494834.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 156.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37837982177734375, + "kl": 0.033761862374376506, + "learning_rate": 5.08e-07, + "loss": 0.0019, + "num_tokens": 2495118.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 157.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14353086054325104, + "kl": 0.020357735455036163, + "learning_rate": 5.076666666666666e-07, + "loss": 0.0011, + "num_tokens": 2495392.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 157.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02693096548318863, + "kl": 0.0017829835414886475, + "learning_rate": 5.073333333333333e-07, + "loss": 0.0001, + "num_tokens": 2495604.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 157.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000296739861369133, + "kl": 6.8694353103637695e-06, + "learning_rate": 5.070000000000001e-07, + "loss": 0.0, + "num_tokens": 2495824.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1642432063817978, + "kl": 0.02829754166305065, + "learning_rate": 5.066666666666667e-07, + "loss": 0.0014, + "num_tokens": 2496126.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 157.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004903525114059448, + "kl": 7.014721632003784e-05, + "learning_rate": 5.063333333333333e-07, + "loss": 0.0, + "num_tokens": 2496338.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 157.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0500800721347332, + "kl": 0.002676622476428747, + "learning_rate": 5.06e-07, + "loss": 0.0001, + "num_tokens": 2496592.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 157.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028845123946666718, + "kl": 0.0053834563586860895, + "learning_rate": 5.056666666666667e-07, + "loss": 0.0002, + "num_tokens": 2496858.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03952033817768097, + "kl": 0.01759189274162054, + "learning_rate": 5.053333333333334e-07, + "loss": 0.0009, + "num_tokens": 2497157.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 157.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04855850338935852, + "kl": 0.006311023724265397, + "learning_rate": 5.05e-07, + "loss": 0.0003, + "num_tokens": 2497450.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 157.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1092715710401535, + "kl": 0.042482415214180946, + "learning_rate": 5.046666666666667e-07, + "loss": 0.0021, + "num_tokens": 2497824.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15142570436000824, + "kl": 0.02115795575082302, + "learning_rate": 5.043333333333333e-07, + "loss": 0.0012, + "num_tokens": 2498109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 157.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.66716480255127, + "kl": 0.027197793126106262, + "learning_rate": 5.040000000000001e-07, + "loss": 0.1654, + "num_tokens": 2498323.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 8489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 157.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05180661752820015, + "kl": 0.011524613946676254, + "learning_rate": 5.036666666666667e-07, + "loss": 0.0005, + "num_tokens": 2498651.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 157.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023557983338832855, + "kl": 0.0008277259767055511, + "learning_rate": 5.033333333333333e-07, + "loss": 0.0, + "num_tokens": 2498894.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 157.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016277238726615906, + "kl": 0.0005233370466157794, + "learning_rate": 5.03e-07, + "loss": 0.0, + "num_tokens": 2499207.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 157.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14778617024421692, + "kl": 0.008364896522834897, + "learning_rate": 5.026666666666667e-07, + "loss": 0.0004, + "num_tokens": 2499464.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 157.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017961518839001656, + "kl": 0.0011804367823060602, + "learning_rate": 5.023333333333333e-07, + "loss": 0.0001, + "num_tokens": 2499740.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 157.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003330766223371029, + "kl": 0.0003237202763557434, + "learning_rate": 5.02e-07, + "loss": 0.0, + "num_tokens": 2500000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1286805421113968, + "kl": 0.03825647942721844, + "learning_rate": 5.016666666666667e-07, + "loss": 0.002, + "num_tokens": 2500269.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6071343421936035, + "kl": 0.07391935959458351, + "learning_rate": 5.013333333333333e-07, + "loss": 0.1767, + "num_tokens": 2500615.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 157.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00044517318019643426, + "kl": 0.0012253480963408947, + "learning_rate": 5.01e-07, + "loss": 0.0001, + "num_tokens": 2500895.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 157.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04226859286427498, + "kl": 0.00385933555662632, + "learning_rate": 5.006666666666667e-07, + "loss": 0.0002, + "num_tokens": 2501207.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 157.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.41400408744812, + "kl": 0.132224190980196, + "learning_rate": 5.003333333333333e-07, + "loss": 0.0363, + "num_tokens": 2501593.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 8500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 157.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029483066871762276, + "kl": 0.002384966181125492, + "learning_rate": 5e-07, + "loss": 0.0001, + "num_tokens": 2501924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 157.44444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1090335845947266, + "kl": 0.1020755278877914, + "learning_rate": 4.996666666666668e-07, + "loss": 0.0041, + "num_tokens": 2502212.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 157.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9743456840515137, + "kl": 0.10830906359478831, + "learning_rate": 4.993333333333333e-07, + "loss": -0.0044, + "num_tokens": 2502554.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 157.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014132065698504448, + "kl": 0.07349956035614014, + "learning_rate": 4.99e-07, + "loss": 0.0037, + "num_tokens": 2502924.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 157.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020838946104049683, + "kl": 0.2649073898792267, + "learning_rate": 4.986666666666667e-07, + "loss": 0.0132, + "num_tokens": 2503228.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07360532134771347, + "kl": 0.038996681571006775, + "learning_rate": 4.983333333333333e-07, + "loss": 0.0019, + "num_tokens": 2503540.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 157.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01846860721707344, + "kl": 0.002976749907247722, + "learning_rate": 4.98e-07, + "loss": 0.0002, + "num_tokens": 2503872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 157.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02639959193766117, + "kl": 0.0014309905236586928, + "learning_rate": 4.976666666666667e-07, + "loss": 0.0001, + "num_tokens": 2504176.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 157.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.622972011566162, + "kl": 0.09838047996163368, + "learning_rate": 4.973333333333333e-07, + "loss": 0.1487, + "num_tokens": 2504535.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 157.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02174372412264347, + "kl": 0.0007273862429428846, + "learning_rate": 4.97e-07, + "loss": 0.0, + "num_tokens": 2504797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 157.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.416647911071777, + "kl": 2.1433481190761086, + "learning_rate": 4.966666666666667e-07, + "loss": 0.1829, + "num_tokens": 2505035.0, + "reward": 2.5, + "reward_std": 3.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.0, + "step": 8511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 157.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08385979384183884, + "kl": 0.014134705998003483, + "learning_rate": 4.963333333333333e-07, + "loss": 0.0007, + "num_tokens": 2505361.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 157.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020961308851838112, + "kl": 0.005719877779483795, + "learning_rate": 4.96e-07, + "loss": 0.0003, + "num_tokens": 2505629.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 157.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07840628921985626, + "kl": 0.011885132640600204, + "learning_rate": 4.956666666666667e-07, + "loss": 0.0006, + "num_tokens": 2505935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008504621684551239, + "kl": 0.0007483886438421905, + "learning_rate": 4.953333333333333e-07, + "loss": 0.0, + "num_tokens": 2506215.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 40.5, + "completions/mean_terminated_length": 40.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 157.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12445389479398727, + "kl": 0.023703727638348937, + "learning_rate": 4.95e-07, + "loss": 0.0012, + "num_tokens": 2506597.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 157.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011719525791704655, + "kl": 0.007828783709555864, + "learning_rate": 4.946666666666667e-07, + "loss": 0.0004, + "num_tokens": 2506869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 157.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7693305015563965, + "kl": 0.0615805983543396, + "learning_rate": 4.943333333333334e-07, + "loss": -0.0955, + "num_tokens": 2507225.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02838600054383278, + "kl": 0.012475146446377039, + "learning_rate": 4.94e-07, + "loss": 0.0007, + "num_tokens": 2507497.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 157.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10307072103023529, + "kl": 0.019812828861176968, + "learning_rate": 4.936666666666667e-07, + "loss": 0.001, + "num_tokens": 2507787.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 157.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03024272806942463, + "kl": 0.03816715348511934, + "learning_rate": 4.933333333333334e-07, + "loss": 0.0019, + "num_tokens": 2508192.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 157.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021345676854252815, + "kl": 0.0006064012777642347, + "learning_rate": 4.93e-07, + "loss": 0.0, + "num_tokens": 2508411.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12507128715515137, + "kl": 0.17312154173851013, + "learning_rate": 4.926666666666667e-07, + "loss": 0.0086, + "num_tokens": 2508721.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 157.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011349762789905071, + "kl": 0.0036967843770980835, + "learning_rate": 4.923333333333333e-07, + "loss": 0.0002, + "num_tokens": 2508957.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 157.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16203093528747559, + "kl": 0.017759868002031, + "learning_rate": 4.92e-07, + "loss": 0.0009, + "num_tokens": 2509285.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 157.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03434457257390022, + "kl": 0.010937778744846582, + "learning_rate": 4.916666666666667e-07, + "loss": 0.0005, + "num_tokens": 2509546.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 157.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006401673890650272, + "kl": 0.0007060051138978451, + "learning_rate": 4.913333333333334e-07, + "loss": 0.0, + "num_tokens": 2509806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 157.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.46056604385376, + "kl": 1.0507410652935505, + "learning_rate": 4.909999999999999e-07, + "loss": 0.0751, + "num_tokens": 2510115.0, + "reward": 2.5, + "reward_std": 3.0, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 3.0, + "step": 8528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02702411264181137, + "kl": 0.0009557564044371247, + "learning_rate": 4.906666666666667e-07, + "loss": 0.0, + "num_tokens": 2510411.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 157.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02450629696249962, + "kl": 0.0019890672992914915, + "learning_rate": 4.903333333333334e-07, + "loss": 0.0001, + "num_tokens": 2510684.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 157.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0746566578745842, + "kl": 0.015292820055037737, + "learning_rate": 4.9e-07, + "loss": 0.0008, + "num_tokens": 2510967.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 158.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06476012617349625, + "kl": 0.0033435896039009094, + "learning_rate": 4.896666666666666e-07, + "loss": 0.0002, + "num_tokens": 2511183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 158.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4616315364837646, + "kl": 0.100852370262146, + "learning_rate": 4.893333333333334e-07, + "loss": 0.0126, + "num_tokens": 2511562.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0408039428293705, + "kl": 0.0022240668768063188, + "learning_rate": 4.89e-07, + "loss": 0.0001, + "num_tokens": 2511834.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 158.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.360039710998535, + "kl": 0.0472066942602396, + "learning_rate": 4.886666666666667e-07, + "loss": -0.1574, + "num_tokens": 2512183.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 158.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031456961296498775, + "kl": 0.0016734758391976357, + "learning_rate": 4.883333333333334e-07, + "loss": 0.0001, + "num_tokens": 2512495.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 158.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02307688631117344, + "kl": 0.001133069396018982, + "learning_rate": 4.879999999999999e-07, + "loss": 0.0, + "num_tokens": 2512711.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024726692587137222, + "kl": 0.00523054925724864, + "learning_rate": 4.876666666666667e-07, + "loss": 0.0003, + "num_tokens": 2512989.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 158.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00701464107260108, + "kl": 0.0006848298653494567, + "learning_rate": 4.873333333333334e-07, + "loss": 0.0, + "num_tokens": 2513273.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 158.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08706600219011307, + "kl": 0.03382064402103424, + "learning_rate": 4.87e-07, + "loss": 0.0017, + "num_tokens": 2513570.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 158.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07695037126541138, + "kl": 0.0071961539797484875, + "learning_rate": 4.866666666666666e-07, + "loss": 0.0004, + "num_tokens": 2513789.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 158.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.12135442881845e-05, + "kl": 2.332031726837158e-06, + "learning_rate": 4.863333333333334e-07, + "loss": 0.0, + "num_tokens": 2514009.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 158.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08905328065156937, + "kl": 0.019308204296976328, + "learning_rate": 4.86e-07, + "loss": 0.001, + "num_tokens": 2514295.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 158.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.437072277069092, + "kl": 0.42022114992141724, + "learning_rate": 4.856666666666667e-07, + "loss": 0.028, + "num_tokens": 2514535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 158.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01061983872205019, + "kl": 0.00872632977552712, + "learning_rate": 4.853333333333333e-07, + "loss": 0.0004, + "num_tokens": 2514809.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 158.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9914193153381348, + "kl": 0.006183861289173365, + "learning_rate": 4.85e-07, + "loss": -0.0005, + "num_tokens": 2515097.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 158.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13980522751808167, + "kl": 0.009371042484417558, + "learning_rate": 4.846666666666667e-07, + "loss": 0.0005, + "num_tokens": 2515362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 158.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0005353864398784935, + "kl": 0.0012410475173965096, + "learning_rate": 4.843333333333334e-07, + "loss": 0.0001, + "num_tokens": 2515642.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 158.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028003400191664696, + "kl": 0.0005597322960966267, + "learning_rate": 4.84e-07, + "loss": 0.0, + "num_tokens": 2515898.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01123598963022232, + "kl": 0.0010447597014717758, + "learning_rate": 4.836666666666666e-07, + "loss": 0.0001, + "num_tokens": 2516166.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 158.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022090567275881767, + "kl": 0.2647307515144348, + "learning_rate": 4.833333333333334e-07, + "loss": 0.0132, + "num_tokens": 2516470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 158.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029005344957113266, + "kl": 0.03795054741203785, + "learning_rate": 4.830000000000001e-07, + "loss": 0.0019, + "num_tokens": 2516875.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 158.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02571025677025318, + "kl": 0.0015140259929466993, + "learning_rate": 4.826666666666666e-07, + "loss": 0.0001, + "num_tokens": 2517197.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036035045981407166, + "kl": 0.0022087815450504422, + "learning_rate": 4.823333333333333e-07, + "loss": 0.0001, + "num_tokens": 2517473.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 158.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024019135162234306, + "kl": 0.002115154347848147, + "learning_rate": 4.82e-07, + "loss": 0.0001, + "num_tokens": 2517797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 158.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004839766770601273, + "kl": 0.0002545595198171213, + "learning_rate": 4.816666666666667e-07, + "loss": 0.0, + "num_tokens": 2518017.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 158.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023963728919625282, + "kl": 0.0013586296408902854, + "learning_rate": 4.813333333333334e-07, + "loss": 0.0001, + "num_tokens": 2518252.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 158.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8045413494110107, + "kl": 0.10301024094223976, + "learning_rate": 4.81e-07, + "loss": 0.0596, + "num_tokens": 2518598.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20957903563976288, + "kl": 0.03722521383315325, + "learning_rate": 4.806666666666666e-07, + "loss": 0.0026, + "num_tokens": 2518876.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 158.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.066530704498291, + "kl": 0.047671230509877205, + "learning_rate": 4.803333333333334e-07, + "loss": 0.0623, + "num_tokens": 2519193.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 158.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0672617256641388, + "kl": 0.00913128606043756, + "learning_rate": 4.800000000000001e-07, + "loss": 0.0005, + "num_tokens": 2519499.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 158.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16807205975055695, + "kl": 0.02666935371235013, + "learning_rate": 4.796666666666666e-07, + "loss": 0.0014, + "num_tokens": 2519792.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044456418603658676, + "kl": 0.004684043116867542, + "learning_rate": 4.793333333333333e-07, + "loss": 0.0002, + "num_tokens": 2520060.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 158.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09888976812362671, + "kl": 0.01866341568529606, + "learning_rate": 4.790000000000001e-07, + "loss": 0.0009, + "num_tokens": 2520389.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 158.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02013307251036167, + "kl": 0.011850446462631226, + "learning_rate": 4.786666666666667e-07, + "loss": 0.0006, + "num_tokens": 2520649.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.201779842376709, + "kl": 0.22302223520819098, + "learning_rate": 4.783333333333333e-07, + "loss": -0.0886, + "num_tokens": 2520911.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 158.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001245591207407415, + "kl": 2.7805566787719727e-05, + "learning_rate": 4.78e-07, + "loss": 0.0, + "num_tokens": 2521123.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010588799603283405, + "kl": 0.0011360293137840927, + "learning_rate": 4.776666666666666e-07, + "loss": 0.0001, + "num_tokens": 2521419.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 158.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038274750113487244, + "kl": 0.0065190650057047606, + "learning_rate": 4.773333333333334e-07, + "loss": 0.0003, + "num_tokens": 2521713.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 158.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7623893022537231, + "kl": 0.08423770777881145, + "learning_rate": 4.77e-07, + "loss": 0.0671, + "num_tokens": 2522081.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 158.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8867781162261963, + "kl": 0.11864447966217995, + "learning_rate": 4.7666666666666667e-07, + "loss": 0.0059, + "num_tokens": 2522381.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 158.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3298551142215729, + "kl": 0.014585547847673297, + "learning_rate": 4.763333333333333e-07, + "loss": 0.0007, + "num_tokens": 2522710.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06043326482176781, + "kl": 0.012919301632791758, + "learning_rate": 4.7600000000000003e-07, + "loss": 0.0005, + "num_tokens": 2523024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 158.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008054965175688267, + "kl": 0.0014065116411074996, + "learning_rate": 4.756666666666667e-07, + "loss": 0.0001, + "num_tokens": 2523284.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 158.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022008635103702545, + "kl": 0.000444166362285614, + "learning_rate": 4.7533333333333333e-07, + "loss": 0.0, + "num_tokens": 2523490.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 158.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5160182118415833, + "kl": 0.42667537182569504, + "learning_rate": 4.75e-07, + "loss": 0.0007, + "num_tokens": 2523860.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 8576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 158.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10601342469453812, + "kl": 0.030844644643366337, + "learning_rate": 4.7466666666666663e-07, + "loss": 0.0017, + "num_tokens": 2524177.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 158.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06133238226175308, + "kl": 0.0183323142118752, + "learning_rate": 4.7433333333333336e-07, + "loss": 0.0009, + "num_tokens": 2524446.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00484099518507719, + "kl": 0.00039897486567497253, + "learning_rate": 4.7400000000000004e-07, + "loss": 0.0, + "num_tokens": 2524706.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 158.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060721710324287415, + "kl": 0.0049377307295799255, + "learning_rate": 4.7366666666666666e-07, + "loss": 0.0003, + "num_tokens": 2524961.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 158.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.252732276916504, + "kl": 0.09712276276695775, + "learning_rate": 4.7333333333333334e-07, + "loss": 0.1094, + "num_tokens": 2525282.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 8581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 158.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.158277750015259, + "kl": 0.05956242233514786, + "learning_rate": 4.7300000000000007e-07, + "loss": -0.0109, + "num_tokens": 2525648.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 158.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03432878479361534, + "kl": 0.007546215783804655, + "learning_rate": 4.726666666666667e-07, + "loss": 0.0004, + "num_tokens": 2525938.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 158.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027142107486724854, + "kl": 0.007473616395145655, + "learning_rate": 4.723333333333333e-07, + "loss": 0.0004, + "num_tokens": 2526230.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 158.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008273106999695301, + "kl": 0.00010346919589210302, + "learning_rate": 4.72e-07, + "loss": 0.0, + "num_tokens": 2526498.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6882164478302, + "kl": 0.15949007868766785, + "learning_rate": 4.716666666666667e-07, + "loss": 0.2016, + "num_tokens": 2526828.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 159.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005431402008980513, + "kl": 0.00021148494124645367, + "learning_rate": 4.7133333333333335e-07, + "loss": 0.0, + "num_tokens": 2527050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13048556447029114, + "kl": 0.012707748916000128, + "learning_rate": 4.71e-07, + "loss": 0.0006, + "num_tokens": 2527374.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02913927286863327, + "kl": 0.006163935177028179, + "learning_rate": 4.7066666666666665e-07, + "loss": 0.0003, + "num_tokens": 2527664.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 159.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3855316936969757, + "kl": 0.06308460980653763, + "learning_rate": 4.703333333333333e-07, + "loss": 0.0032, + "num_tokens": 2527998.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 159.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027163501363247633, + "kl": 0.00025666777219157666, + "learning_rate": 4.7000000000000005e-07, + "loss": 0.0, + "num_tokens": 2528260.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 159.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005124661140143871, + "kl": 0.0015183575451374054, + "learning_rate": 4.696666666666667e-07, + "loss": 0.0001, + "num_tokens": 2528572.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 159.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007446943782269955, + "kl": 0.0014121129643172026, + "learning_rate": 4.6933333333333335e-07, + "loss": 0.0001, + "num_tokens": 2528849.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056365855038166046, + "kl": 0.04130646586418152, + "learning_rate": 4.69e-07, + "loss": 0.0021, + "num_tokens": 2529147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036859773099422455, + "kl": 0.004151446162723005, + "learning_rate": 4.686666666666667e-07, + "loss": 0.0002, + "num_tokens": 2529438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 159.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06409481167793274, + "kl": 0.010138588957488537, + "learning_rate": 4.683333333333334e-07, + "loss": 0.0005, + "num_tokens": 2529754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 159.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024869225919246674, + "kl": 0.00039105117321014404, + "learning_rate": 4.68e-07, + "loss": 0.0, + "num_tokens": 2529967.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 159.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022369252517819405, + "kl": 0.011333707720041275, + "learning_rate": 4.676666666666667e-07, + "loss": 0.0006, + "num_tokens": 2530227.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 159.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020912325009703636, + "kl": 0.005029738647863269, + "learning_rate": 4.673333333333333e-07, + "loss": 0.0003, + "num_tokens": 2530564.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023824648931622505, + "kl": 0.1605878844857216, + "learning_rate": 4.6700000000000004e-07, + "loss": 0.008, + "num_tokens": 2530874.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 159.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023516716435551643, + "kl": 0.004270400386303663, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.0002, + "num_tokens": 2531142.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 159.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035752128809690475, + "kl": 0.026269923895597458, + "learning_rate": 4.6633333333333334e-07, + "loss": 0.0013, + "num_tokens": 2531515.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 159.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03827158734202385, + "kl": 0.00623775040730834, + "learning_rate": 4.6599999999999996e-07, + "loss": 0.0003, + "num_tokens": 2531816.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 159.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06170351058244705, + "kl": 0.009488976560533047, + "learning_rate": 4.656666666666667e-07, + "loss": 0.0005, + "num_tokens": 2532145.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 159.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009644703241065145, + "kl": 6.021261287969537e-05, + "learning_rate": 4.6533333333333337e-07, + "loss": 0.0, + "num_tokens": 2532401.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 159.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00020436133490875363, + "kl": 4.559755325317383e-06, + "learning_rate": 4.65e-07, + "loss": 0.0, + "num_tokens": 2532621.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36243778467178345, + "kl": 0.062055718153715134, + "learning_rate": 4.6466666666666667e-07, + "loss": 0.0029, + "num_tokens": 2532963.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 159.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9392231106758118, + "kl": 0.14744428172707558, + "learning_rate": 4.643333333333333e-07, + "loss": 0.0101, + "num_tokens": 2533274.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 159.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8875181674957275, + "kl": 0.006092198193073273, + "learning_rate": 4.64e-07, + "loss": 0.0254, + "num_tokens": 2533545.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 159.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08360299468040466, + "kl": 0.024957576766610146, + "learning_rate": 4.636666666666667e-07, + "loss": 0.0013, + "num_tokens": 2533854.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13966958224773407, + "kl": 0.04225391894578934, + "learning_rate": 4.6333333333333333e-07, + "loss": 0.0021, + "num_tokens": 2534191.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 159.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8296147584915161, + "kl": 0.14229894056916237, + "learning_rate": 4.63e-07, + "loss": 0.0817, + "num_tokens": 2534613.0, + "reward": 2.174999952316284, + "reward_std": 1.1786290407180786, + "rewards/reward_combined/mean": 2.174999952316284, + "rewards/reward_combined/std": 1.1786291599273682, + "step": 8612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 159.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02089979499578476, + "kl": 0.0007220010011224076, + "learning_rate": 4.6266666666666673e-07, + "loss": 0.0, + "num_tokens": 2534829.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 159.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.557019233703613, + "kl": 0.11554973479360342, + "learning_rate": 4.6233333333333336e-07, + "loss": 0.0259, + "num_tokens": 2535104.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 159.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03735779598355293, + "kl": 0.002009578049182892, + "learning_rate": 4.62e-07, + "loss": 0.0001, + "num_tokens": 2535348.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 159.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02146579697728157, + "kl": 0.26481232047080994, + "learning_rate": 4.6166666666666666e-07, + "loss": 0.0132, + "num_tokens": 2535652.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 159.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04026374593377113, + "kl": 0.0018839865952031687, + "learning_rate": 4.613333333333334e-07, + "loss": 0.0001, + "num_tokens": 2535950.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 159.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.249157905578613, + "kl": 0.33205313235521317, + "learning_rate": 4.61e-07, + "loss": 0.2582, + "num_tokens": 2536173.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 8618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 159.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0649443045258522, + "kl": 0.0010302364826202393, + "learning_rate": 4.606666666666667e-07, + "loss": 0.0001, + "num_tokens": 2536379.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 159.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019475027220323682, + "kl": 0.003537513315677643, + "learning_rate": 4.603333333333333e-07, + "loss": 0.0002, + "num_tokens": 2536615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 159.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019151508808135986, + "kl": 0.0006709507724735886, + "learning_rate": 4.6e-07, + "loss": 0.0, + "num_tokens": 2536884.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 159.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007090226747095585, + "kl": 0.00010500549979042262, + "learning_rate": 4.596666666666667e-07, + "loss": 0.0, + "num_tokens": 2537154.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01995955966413021, + "kl": 0.0008948360045906156, + "learning_rate": 4.5933333333333334e-07, + "loss": 0.0, + "num_tokens": 2537434.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 159.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.824875831604004, + "kl": 0.0568277578568086, + "learning_rate": 4.59e-07, + "loss": 0.353, + "num_tokens": 2537811.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 159.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0163147933781147, + "kl": 0.0007652853382751346, + "learning_rate": 4.5866666666666664e-07, + "loss": 0.0, + "num_tokens": 2538046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 159.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0985773429274559, + "kl": 0.006719287019222975, + "learning_rate": 4.583333333333334e-07, + "loss": 0.0003, + "num_tokens": 2538307.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 159.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08132348954677582, + "kl": 0.0070319268852472305, + "learning_rate": 4.5800000000000005e-07, + "loss": 0.0004, + "num_tokens": 2538605.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 159.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19190256297588348, + "kl": 0.03531090263277292, + "learning_rate": 4.576666666666667e-07, + "loss": 0.0017, + "num_tokens": 2538890.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 159.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02645549736917019, + "kl": 0.008679892867803574, + "learning_rate": 4.573333333333333e-07, + "loss": 0.0004, + "num_tokens": 2539182.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 159.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004941096995025873, + "kl": 0.00041915103793144226, + "learning_rate": 4.57e-07, + "loss": 0.0, + "num_tokens": 2539442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 159.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08969242125749588, + "kl": 0.017657252494245768, + "learning_rate": 4.566666666666667e-07, + "loss": 0.0009, + "num_tokens": 2539728.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8631 + }, + { + "clip_ratio/high_max": 0.0055555556900799274, + "clip_ratio/high_mean": 0.0055555556900799274, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0055555556900799274, + "completion_length": 41.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 159.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9159317016601562, + "kl": 0.10721618309617043, + "learning_rate": 4.5633333333333333e-07, + "loss": 0.0006, + "num_tokens": 2540110.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 8632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036441702395677567, + "kl": 0.030656098388135433, + "learning_rate": 4.56e-07, + "loss": 0.0015, + "num_tokens": 2540384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 159.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025416595861315727, + "kl": 0.09495680779218674, + "learning_rate": 4.5566666666666663e-07, + "loss": 0.0047, + "num_tokens": 2540756.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 159.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.351839303970337, + "kl": 0.1992230974137783, + "learning_rate": 4.5533333333333336e-07, + "loss": 0.0441, + "num_tokens": 2541142.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 159.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3036472797393799, + "kl": 0.10299092531204224, + "learning_rate": 4.5500000000000004e-07, + "loss": 0.0266, + "num_tokens": 2541480.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 159.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02982376702129841, + "kl": 0.0021069254144094884, + "learning_rate": 4.5466666666666666e-07, + "loss": 0.0001, + "num_tokens": 2541809.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 159.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04965667426586151, + "kl": 0.012602425646036863, + "learning_rate": 4.5433333333333334e-07, + "loss": 0.0006, + "num_tokens": 2542113.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 159.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03606492653489113, + "kl": 0.005830902606248856, + "learning_rate": 4.5399999999999996e-07, + "loss": 0.0003, + "num_tokens": 2542399.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 160.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07060271501541138, + "kl": 0.003959645750001073, + "learning_rate": 4.536666666666667e-07, + "loss": 0.0002, + "num_tokens": 2542658.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 160.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0037361718714237213, + "kl": 0.0019822437316179276, + "learning_rate": 4.5333333333333337e-07, + "loss": 0.0001, + "num_tokens": 2542970.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8641 + }, + { + "clip_ratio/high_max": 0.013513513840734959, + "clip_ratio/high_mean": 0.013513513840734959, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013513513840734959, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 160.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3153436183929443, + "kl": 0.09401218220591545, + "learning_rate": 4.53e-07, + "loss": -0.0555, + "num_tokens": 2543348.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 8642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 160.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02069438062608242, + "kl": 0.0004439055919647217, + "learning_rate": 4.5266666666666667e-07, + "loss": 0.0, + "num_tokens": 2543558.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0434831865131855, + "kl": 0.004447659943252802, + "learning_rate": 4.523333333333334e-07, + "loss": 0.0002, + "num_tokens": 2543850.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 160.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.218278169631958, + "kl": 0.08415662194602191, + "learning_rate": 4.52e-07, + "loss": -0.0122, + "num_tokens": 2544143.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 160.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022728722542524338, + "kl": 0.0007326866034418344, + "learning_rate": 4.5166666666666665e-07, + "loss": 0.0, + "num_tokens": 2544394.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 160.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004622384440153837, + "kl": 0.00038431957364082336, + "learning_rate": 4.513333333333333e-07, + "loss": 0.0, + "num_tokens": 2544654.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 160.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001387011376209557, + "kl": 3.723055124282837e-05, + "learning_rate": 4.5100000000000005e-07, + "loss": 0.0, + "num_tokens": 2544866.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 160.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02367924153804779, + "kl": 0.0006719735392834991, + "learning_rate": 4.506666666666667e-07, + "loss": 0.0, + "num_tokens": 2545100.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05397434160113335, + "kl": 0.16236093640327454, + "learning_rate": 4.5033333333333336e-07, + "loss": 0.0081, + "num_tokens": 2545410.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03227872774004936, + "kl": 0.002121766214258969, + "learning_rate": 4.5e-07, + "loss": 0.0001, + "num_tokens": 2545683.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 160.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024951478466391563, + "kl": 0.0016541053773835301, + "learning_rate": 4.4966666666666666e-07, + "loss": 0.0001, + "num_tokens": 2545957.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1402907371521, + "kl": 0.019108325242996216, + "learning_rate": 4.493333333333334e-07, + "loss": 0.122, + "num_tokens": 2546249.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 160.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.882719874382019, + "kl": 0.14848625287413597, + "learning_rate": 4.49e-07, + "loss": 0.0184, + "num_tokens": 2546590.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 8654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 160.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00010141506209038198, + "kl": 2.346932888031006e-06, + "learning_rate": 4.486666666666667e-07, + "loss": 0.0, + "num_tokens": 2546810.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022499442100524902, + "kl": 0.007729256059974432, + "learning_rate": 4.483333333333333e-07, + "loss": 0.0004, + "num_tokens": 2547104.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 160.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021007382310926914, + "kl": 4.374980926513672e-05, + "learning_rate": 4.4800000000000004e-07, + "loss": 0.0, + "num_tokens": 2547360.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 160.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035914354026317596, + "kl": 0.007574115530587733, + "learning_rate": 4.476666666666667e-07, + "loss": 0.0004, + "num_tokens": 2547703.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 160.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019422711338847876, + "kl": 0.0035448744893074036, + "learning_rate": 4.4733333333333334e-07, + "loss": 0.0002, + "num_tokens": 2547939.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04819255694746971, + "kl": 0.02114738430827856, + "learning_rate": 4.4699999999999997e-07, + "loss": 0.0012, + "num_tokens": 2548219.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007517922669649124, + "kl": 0.0014330648118630052, + "learning_rate": 4.4666666666666664e-07, + "loss": 0.0001, + "num_tokens": 2548496.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201191395521164, + "kl": 0.040469877421855927, + "learning_rate": 4.4633333333333337e-07, + "loss": 0.002, + "num_tokens": 2548856.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 160.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018833497539162636, + "kl": 0.0008993387164082378, + "learning_rate": 4.46e-07, + "loss": 0.0, + "num_tokens": 2549184.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13987883925437927, + "kl": 0.027296412270516157, + "learning_rate": 4.456666666666667e-07, + "loss": 0.0014, + "num_tokens": 2549462.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 61.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 61.75, + "completions/mean_terminated_length": 61.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8982056379318237, + "kl": 0.011345220729708672, + "learning_rate": 4.453333333333333e-07, + "loss": 0.4524, + "num_tokens": 2549933.0, + "reward": 6.050000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 6.050000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 8665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 160.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02985193394124508, + "kl": 0.004056473029777408, + "learning_rate": 4.4500000000000003e-07, + "loss": 0.0002, + "num_tokens": 2550262.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 76.75, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0085318088531494, + "kl": 0.048484018072485924, + "learning_rate": 4.446666666666667e-07, + "loss": 0.4558, + "num_tokens": 2550789.0, + "reward": 5.675000190734863, + "reward_std": 3.6499998569488525, + "rewards/reward_combined/mean": 5.675000190734863, + "rewards/reward_combined/std": 3.6500000953674316, + "step": 8667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06846870481967926, + "kl": 0.010346206836402416, + "learning_rate": 4.4433333333333333e-07, + "loss": 0.0005, + "num_tokens": 2551078.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8624725341796875, + "kl": 0.14514271169900894, + "learning_rate": 4.44e-07, + "loss": -0.2538, + "num_tokens": 2551381.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 8669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 160.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03502403572201729, + "kl": 0.004048785107443109, + "learning_rate": 4.4366666666666663e-07, + "loss": 0.0002, + "num_tokens": 2551639.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 160.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07920677214860916, + "kl": 0.008172958623617887, + "learning_rate": 4.4333333333333336e-07, + "loss": 0.0004, + "num_tokens": 2551944.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032318998128175735, + "kl": 0.00595466373488307, + "learning_rate": 4.4300000000000004e-07, + "loss": 0.0003, + "num_tokens": 2552325.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 160.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020329784601926804, + "kl": 0.006174417831061874, + "learning_rate": 4.4266666666666666e-07, + "loss": 0.0003, + "num_tokens": 2552597.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 160.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08013773709535599, + "kl": 0.00739249074831605, + "learning_rate": 4.4233333333333334e-07, + "loss": 0.0004, + "num_tokens": 2552816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 160.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09457864612340927, + "kl": 0.042728934437036514, + "learning_rate": 4.4200000000000007e-07, + "loss": 0.002, + "num_tokens": 2553193.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 160.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25403594970703125, + "kl": 0.040088089561322704, + "learning_rate": 4.416666666666667e-07, + "loss": 0.0023, + "num_tokens": 2553457.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 160.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06741530448198318, + "kl": 0.007070574734825641, + "learning_rate": 4.413333333333333e-07, + "loss": 0.0004, + "num_tokens": 2553755.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 160.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010327140800654888, + "kl": 0.0006848268094472587, + "learning_rate": 4.41e-07, + "loss": 0.0, + "num_tokens": 2554041.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 160.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016408642986789346, + "kl": 0.00011049045861000195, + "learning_rate": 4.406666666666667e-07, + "loss": 0.0, + "num_tokens": 2554353.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 160.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021787144243717194, + "kl": 0.0009699314832687378, + "learning_rate": 4.4033333333333335e-07, + "loss": 0.0, + "num_tokens": 2554565.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 160.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04410265013575554, + "kl": 0.014314462430775166, + "learning_rate": 4.4e-07, + "loss": 0.0007, + "num_tokens": 2554869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 160.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04200639948248863, + "kl": 0.011541795916855335, + "learning_rate": 4.3966666666666665e-07, + "loss": 0.0006, + "num_tokens": 2555204.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0018939394503831863, + "clip_ratio/low_min": 0.0018939394503831863, + "clip_ratio/region_mean": 0.0018939394503831863, + "completion_length": 70.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 70.5, + "completions/mean_terminated_length": 8.666666984558105, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 160.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.058505058288574, + "kl": 0.003887388505972922, + "learning_rate": 4.393333333333333e-07, + "loss": 0.47, + "num_tokens": 2555706.0, + "reward": 7.300000190734863, + "reward_std": 0.40000009536743164, + "rewards/reward_combined/mean": 7.300000190734863, + "rewards/reward_combined/std": 0.40000009536743164, + "step": 8683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 160.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02446591481566429, + "kl": 0.03118347004055977, + "learning_rate": 4.3900000000000005e-07, + "loss": 0.0016, + "num_tokens": 2556112.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 160.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6730837225914001, + "kl": 0.06484004156664014, + "learning_rate": 4.386666666666667e-07, + "loss": 0.0029, + "num_tokens": 2556408.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 160.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0480692386627197, + "kl": 0.057574307546019554, + "learning_rate": 4.3833333333333335e-07, + "loss": -0.0269, + "num_tokens": 2556773.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 160.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021530449390411377, + "kl": 0.264899879693985, + "learning_rate": 4.38e-07, + "loss": 0.0132, + "num_tokens": 2557077.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 160.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004730751272290945, + "kl": 0.00020885467529296875, + "learning_rate": 4.376666666666667e-07, + "loss": 0.0, + "num_tokens": 2557297.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4174129962921143, + "kl": 0.07839278131723404, + "learning_rate": 4.373333333333334e-07, + "loss": -0.0481, + "num_tokens": 2557599.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 160.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1381855010986328, + "kl": 0.013612152077257633, + "learning_rate": 4.37e-07, + "loss": 0.0009, + "num_tokens": 2557941.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 160.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08004219084978104, + "kl": 0.010712900198996067, + "learning_rate": 4.3666666666666663e-07, + "loss": 0.0006, + "num_tokens": 2558261.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 160.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027102116495370865, + "kl": 0.0008121976570691913, + "learning_rate": 4.363333333333333e-07, + "loss": 0.0, + "num_tokens": 2558528.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 160.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020501334220170975, + "kl": 0.011936224065721035, + "learning_rate": 4.3600000000000004e-07, + "loss": 0.0006, + "num_tokens": 2558788.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 161.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028638113290071487, + "kl": 0.09418239071965218, + "learning_rate": 4.3566666666666666e-07, + "loss": 0.0047, + "num_tokens": 2559161.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 161.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039101943373680115, + "kl": 0.0010855465952772647, + "learning_rate": 4.3533333333333334e-07, + "loss": 0.0001, + "num_tokens": 2559417.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 161.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009871583431959152, + "kl": 0.0007590443128719926, + "learning_rate": 4.3499999999999996e-07, + "loss": 0.0, + "num_tokens": 2559677.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 161.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02007290907204151, + "kl": 0.0029251491650938988, + "learning_rate": 4.346666666666667e-07, + "loss": 0.0001, + "num_tokens": 2559989.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 161.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004962783306837082, + "kl": 5.539953781408258e-05, + "learning_rate": 4.3433333333333337e-07, + "loss": 0.0, + "num_tokens": 2560202.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 161.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.6566057205200195, + "kl": 0.04266488179564476, + "learning_rate": 4.34e-07, + "loss": 0.078, + "num_tokens": 2560487.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 8699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 161.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5060837864875793, + "kl": 0.03299644310027361, + "learning_rate": 4.3366666666666667e-07, + "loss": 0.0022, + "num_tokens": 2560769.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 161.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.2146897315979, + "kl": 0.03656699322164059, + "learning_rate": 4.333333333333333e-07, + "loss": 0.0013, + "num_tokens": 2561124.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 161.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05218846723437309, + "kl": 0.0114898094907403, + "learning_rate": 4.33e-07, + "loss": 0.0006, + "num_tokens": 2561449.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 161.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15996979176998138, + "kl": 0.0243934728205204, + "learning_rate": 4.326666666666667e-07, + "loss": 0.0012, + "num_tokens": 2561758.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 161.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029595768079161644, + "kl": 0.09536663070321083, + "learning_rate": 4.3233333333333333e-07, + "loss": 0.0048, + "num_tokens": 2562130.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 161.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045077454298734665, + "kl": 0.0015447183977812529, + "learning_rate": 4.32e-07, + "loss": 0.0001, + "num_tokens": 2562394.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 161.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006517356378026307, + "kl": 0.0012625583331100643, + "learning_rate": 4.3166666666666673e-07, + "loss": 0.0001, + "num_tokens": 2562674.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 161.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033829428255558014, + "kl": 0.001004789024591446, + "learning_rate": 4.3133333333333336e-07, + "loss": 0.0001, + "num_tokens": 2562934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 161.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05706070363521576, + "kl": 0.004463211516849697, + "learning_rate": 4.31e-07, + "loss": 0.0002, + "num_tokens": 2563259.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 161.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05499446019530296, + "kl": 0.013950803317129612, + "learning_rate": 4.3066666666666666e-07, + "loss": 0.0008, + "num_tokens": 2563533.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 161.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.846357822418213, + "kl": 0.017044548789272085, + "learning_rate": 4.303333333333334e-07, + "loss": 0.1061, + "num_tokens": 2563835.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 161.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17474430799484253, + "kl": 0.04627208597958088, + "learning_rate": 4.3e-07, + "loss": 0.0024, + "num_tokens": 2564126.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 161.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30064892768859863, + "kl": 0.022759624291211367, + "learning_rate": 4.296666666666667e-07, + "loss": 0.0013, + "num_tokens": 2564405.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 161.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05347103998064995, + "kl": 0.0051518643740564585, + "learning_rate": 4.293333333333333e-07, + "loss": 0.0003, + "num_tokens": 2564735.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 161.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03452187776565552, + "kl": 0.033436816185712814, + "learning_rate": 4.29e-07, + "loss": 0.0017, + "num_tokens": 2565140.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 161.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09435445070266724, + "kl": 0.01236627995967865, + "learning_rate": 4.286666666666667e-07, + "loss": 0.0006, + "num_tokens": 2565479.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 161.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031507983803749084, + "kl": 0.001063937903381884, + "learning_rate": 4.2833333333333334e-07, + "loss": 0.0001, + "num_tokens": 2565755.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 40.0, + "completions/mean_terminated_length": 40.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 161.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0394740104675293, + "kl": 0.15030419826507568, + "learning_rate": 4.28e-07, + "loss": 0.051, + "num_tokens": 2566131.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 161.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1364179253578186, + "kl": 0.017006624955683947, + "learning_rate": 4.2766666666666664e-07, + "loss": 0.0008, + "num_tokens": 2566422.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 161.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09620498865842819, + "kl": 0.0694228857755661, + "learning_rate": 4.273333333333334e-07, + "loss": 0.0035, + "num_tokens": 2566799.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 161.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1384589523077011, + "kl": 0.014049086938030086, + "learning_rate": 4.2700000000000005e-07, + "loss": 0.001, + "num_tokens": 2567041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 161.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9725539684295654, + "kl": 0.13209390453994274, + "learning_rate": 4.266666666666667e-07, + "loss": 0.2107, + "num_tokens": 2567410.0, + "reward": 4.875, + "reward_std": 5.25, + "rewards/reward_combined/mean": 4.875, + "rewards/reward_combined/std": 5.25, + "step": 8721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 161.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01852143369615078, + "kl": 0.008005412295460701, + "learning_rate": 4.263333333333333e-07, + "loss": 0.0004, + "num_tokens": 2567684.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 161.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02794245071709156, + "kl": 0.0004574150952976197, + "learning_rate": 4.26e-07, + "loss": 0.0, + "num_tokens": 2567927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 161.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004467817023396492, + "kl": 0.00020632743689930066, + "learning_rate": 4.256666666666667e-07, + "loss": 0.0, + "num_tokens": 2568147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 161.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9460231065750122, + "kl": 0.1277504339814186, + "learning_rate": 4.2533333333333333e-07, + "loss": 0.0066, + "num_tokens": 2568464.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 161.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02923583798110485, + "kl": 0.006599893171369331, + "learning_rate": 4.25e-07, + "loss": 0.0003, + "num_tokens": 2568736.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 161.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08563487976789474, + "kl": 0.03685462847352028, + "learning_rate": 4.2466666666666663e-07, + "loss": 0.0018, + "num_tokens": 2569034.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 161.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07928966730833054, + "kl": 0.01721447065938264, + "learning_rate": 4.2433333333333336e-07, + "loss": 0.0009, + "num_tokens": 2569363.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 161.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042829133570194244, + "kl": 0.006660679820924997, + "learning_rate": 4.2400000000000004e-07, + "loss": 0.0003, + "num_tokens": 2569654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 161.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001952077727764845, + "kl": 0.003544352948665619, + "learning_rate": 4.2366666666666666e-07, + "loss": 0.0002, + "num_tokens": 2569890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 161.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09785790741443634, + "kl": 0.0075979826506227255, + "learning_rate": 4.2333333333333334e-07, + "loss": 0.0004, + "num_tokens": 2570166.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 161.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01978292502462864, + "kl": 0.0007598996162414551, + "learning_rate": 4.2299999999999996e-07, + "loss": 0.0, + "num_tokens": 2570378.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 161.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06639070063829422, + "kl": 0.0010628923773765564, + "learning_rate": 4.226666666666667e-07, + "loss": 0.0001, + "num_tokens": 2570588.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 161.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1997898668050766, + "kl": 0.05732985585927963, + "learning_rate": 4.2233333333333337e-07, + "loss": 0.0029, + "num_tokens": 2570862.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 161.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005731337238103151, + "kl": 0.00033209921093657613, + "learning_rate": 4.22e-07, + "loss": 0.0, + "num_tokens": 2571122.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 161.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012623523361980915, + "kl": 0.00041728348878677934, + "learning_rate": 4.2166666666666667e-07, + "loss": 0.0, + "num_tokens": 2571436.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 161.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.888056755065918, + "kl": 0.0346012469381094, + "learning_rate": 4.213333333333334e-07, + "loss": 0.0994, + "num_tokens": 2571731.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 161.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02708945795893669, + "kl": 0.1629871353507042, + "learning_rate": 4.21e-07, + "loss": 0.0081, + "num_tokens": 2572040.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 161.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005142015404999256, + "kl": 0.0014799535274505615, + "learning_rate": 4.2066666666666665e-07, + "loss": 0.0001, + "num_tokens": 2572256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 161.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03366416320204735, + "kl": 0.003132957383058965, + "learning_rate": 4.203333333333333e-07, + "loss": 0.0002, + "num_tokens": 2572558.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 161.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02851632796227932, + "kl": 0.0005180761218070984, + "learning_rate": 4.2000000000000006e-07, + "loss": 0.0, + "num_tokens": 2572778.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 161.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0935473442077637, + "kl": 0.384590744972229, + "learning_rate": 4.196666666666667e-07, + "loss": 0.0389, + "num_tokens": 2573083.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.25, + "completions/mean_terminated_length": 39.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 161.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06690575182437897, + "kl": 0.034862760454416275, + "learning_rate": 4.1933333333333336e-07, + "loss": 0.0017, + "num_tokens": 2573464.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 161.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10117001086473465, + "kl": 0.033521804958581924, + "learning_rate": 4.19e-07, + "loss": 0.0017, + "num_tokens": 2573772.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 161.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03286002576351166, + "kl": 0.008927557151764631, + "learning_rate": 4.1866666666666666e-07, + "loss": 0.0004, + "num_tokens": 2574126.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 161.96296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.83968186378479, + "kl": 0.0157951662549749, + "learning_rate": 4.183333333333334e-07, + "loss": 0.0537, + "num_tokens": 2574418.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 161.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20594388246536255, + "kl": 0.02551776822656393, + "learning_rate": 4.18e-07, + "loss": 0.0015, + "num_tokens": 2574718.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 162.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021271243691444397, + "kl": 0.011651545763015747, + "learning_rate": 4.176666666666667e-07, + "loss": 0.0006, + "num_tokens": 2574978.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 162.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006680145859718323, + "kl": 0.1638127788901329, + "learning_rate": 4.173333333333333e-07, + "loss": 0.0082, + "num_tokens": 2575286.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 162.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008309624157845974, + "kl": 0.000858905230415985, + "learning_rate": 4.1700000000000004e-07, + "loss": 0.0, + "num_tokens": 2575570.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08076220005750656, + "kl": 0.00554187607485801, + "learning_rate": 4.166666666666667e-07, + "loss": 0.0003, + "num_tokens": 2575846.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00872249435633421, + "kl": 0.00010294913590769283, + "learning_rate": 4.1633333333333334e-07, + "loss": 0.0, + "num_tokens": 2576102.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 162.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03481372073292732, + "kl": 0.002255752682685852, + "learning_rate": 4.1599999999999997e-07, + "loss": 0.0001, + "num_tokens": 2576318.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 162.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05893750116229057, + "kl": 0.010167696047574282, + "learning_rate": 4.1566666666666664e-07, + "loss": 0.0005, + "num_tokens": 2576594.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 162.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029528116807341576, + "kl": 0.0009664545068517327, + "learning_rate": 4.1533333333333337e-07, + "loss": 0.0001, + "num_tokens": 2576810.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 162.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020393524318933487, + "kl": 0.0013765437761321664, + "learning_rate": 4.15e-07, + "loss": 0.0001, + "num_tokens": 2577044.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 162.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035633333027362823, + "kl": 0.0019097463809885085, + "learning_rate": 4.146666666666667e-07, + "loss": 0.0001, + "num_tokens": 2577340.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 162.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01583975739777088, + "kl": 0.019105815328657627, + "learning_rate": 4.143333333333333e-07, + "loss": 0.001, + "num_tokens": 2577618.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 162.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047380611300468445, + "kl": 0.005481145344674587, + "learning_rate": 4.1400000000000003e-07, + "loss": 0.0003, + "num_tokens": 2577946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 162.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08289684355258942, + "kl": 0.014533222652971745, + "learning_rate": 4.136666666666667e-07, + "loss": 0.0008, + "num_tokens": 2578234.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 162.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009104477241635323, + "kl": 0.00032294541597366333, + "learning_rate": 4.1333333333333333e-07, + "loss": 0.0, + "num_tokens": 2578478.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 162.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045783333480358124, + "kl": 0.09446677565574646, + "learning_rate": 4.13e-07, + "loss": 0.0047, + "num_tokens": 2578852.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 162.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34242498874664307, + "kl": 0.048494853079319, + "learning_rate": 4.1266666666666663e-07, + "loss": 0.0024, + "num_tokens": 2579150.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 162.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02272961102426052, + "kl": 0.00021896511316299438, + "learning_rate": 4.1233333333333336e-07, + "loss": 0.0, + "num_tokens": 2579362.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 162.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03676637262105942, + "kl": 0.001942979171872139, + "learning_rate": 4.1200000000000004e-07, + "loss": 0.0001, + "num_tokens": 2579686.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 162.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03820367529988289, + "kl": 0.005818691803142428, + "learning_rate": 4.1166666666666666e-07, + "loss": 0.0003, + "num_tokens": 2579976.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 162.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0423758402466774, + "kl": 0.009468005038797855, + "learning_rate": 4.1133333333333334e-07, + "loss": 0.0005, + "num_tokens": 2580308.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 162.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13000838458538055, + "kl": 0.03317350219003856, + "learning_rate": 4.1100000000000007e-07, + "loss": 0.0017, + "num_tokens": 2580597.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 162.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018180948682129383, + "kl": 0.0035621225833892822, + "learning_rate": 4.106666666666667e-07, + "loss": 0.0002, + "num_tokens": 2580833.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 162.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0681513249874115, + "kl": 0.06191878952085972, + "learning_rate": 4.103333333333333e-07, + "loss": 0.0031, + "num_tokens": 2581208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 162.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03283923119306564, + "kl": 0.002975815557874739, + "learning_rate": 4.1e-07, + "loss": 0.0001, + "num_tokens": 2581500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 162.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02938288077712059, + "kl": 0.006535356555104954, + "learning_rate": 4.096666666666667e-07, + "loss": 0.0003, + "num_tokens": 2581772.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 162.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02980111353099346, + "kl": 0.003871294902637601, + "learning_rate": 4.0933333333333335e-07, + "loss": 0.0002, + "num_tokens": 2582108.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 162.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859311580657959, + "kl": 0.588653638958931, + "learning_rate": 4.09e-07, + "loss": 0.014, + "num_tokens": 2582411.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 162.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11337968707084656, + "kl": 0.02585682040080428, + "learning_rate": 4.0866666666666665e-07, + "loss": 0.0013, + "num_tokens": 2582685.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.144474506378174, + "kl": 0.06048629805445671, + "learning_rate": 4.083333333333333e-07, + "loss": 0.0979, + "num_tokens": 2582978.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 162.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06555739790201187, + "kl": 0.028324289247393608, + "learning_rate": 4.0800000000000005e-07, + "loss": 0.0014, + "num_tokens": 2583324.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09180674701929092, + "kl": 0.012973645003512502, + "learning_rate": 4.076666666666667e-07, + "loss": 0.0006, + "num_tokens": 2583596.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 162.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1345089077949524, + "kl": 0.01267294306308031, + "learning_rate": 4.0733333333333335e-07, + "loss": 0.0007, + "num_tokens": 2583853.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 162.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16680441796779633, + "kl": 0.05141289532184601, + "learning_rate": 4.07e-07, + "loss": 0.0026, + "num_tokens": 2584151.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 43.5, + "completions/mean_terminated_length": 43.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 162.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054858043789863586, + "kl": 0.04051884450018406, + "learning_rate": 4.066666666666667e-07, + "loss": 0.002, + "num_tokens": 2584549.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 162.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02385501191020012, + "kl": 0.0009342855628347024, + "learning_rate": 4.063333333333334e-07, + "loss": 0.0, + "num_tokens": 2584865.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 162.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15966592729091644, + "kl": 0.019496652763336897, + "learning_rate": 4.06e-07, + "loss": 0.001, + "num_tokens": 2585155.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0056667691096663475, + "kl": 0.00030512810917571187, + "learning_rate": 4.0566666666666663e-07, + "loss": 0.0, + "num_tokens": 2585415.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 162.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.068002843065187e-05, + "kl": 2.123415470123291e-06, + "learning_rate": 4.053333333333333e-07, + "loss": 0.0, + "num_tokens": 2585635.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 162.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03649111092090607, + "kl": 0.051969584077596664, + "learning_rate": 4.0500000000000004e-07, + "loss": 0.0026, + "num_tokens": 2585967.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 162.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2128031998872757, + "kl": 0.02754961373284459, + "learning_rate": 4.0466666666666666e-07, + "loss": 0.0016, + "num_tokens": 2586292.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 162.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0313015952706337, + "kl": 0.03878430649638176, + "learning_rate": 4.0433333333333334e-07, + "loss": 0.0019, + "num_tokens": 2586697.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 162.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033458877354860306, + "kl": 0.0011168313212692738, + "learning_rate": 4.0399999999999996e-07, + "loss": 0.0001, + "num_tokens": 2586962.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 162.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017862927168607712, + "kl": 0.00031419098377227783, + "learning_rate": 4.036666666666667e-07, + "loss": 0.0, + "num_tokens": 2587166.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 162.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.492960214614868, + "kl": 0.049693545675836504, + "learning_rate": 4.0333333333333337e-07, + "loss": 0.0046, + "num_tokens": 2587426.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 8791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 162.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011306415544822812, + "kl": 0.0003612823784351349, + "learning_rate": 4.03e-07, + "loss": 0.0, + "num_tokens": 2587686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 162.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004961847327649593, + "kl": 0.00025772452499950305, + "learning_rate": 4.0266666666666667e-07, + "loss": 0.0, + "num_tokens": 2587906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 162.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021443558856844902, + "kl": 0.002437490038573742, + "learning_rate": 4.023333333333333e-07, + "loss": 0.0001, + "num_tokens": 2588218.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 162.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015272899530827999, + "kl": 0.0017460708040744066, + "learning_rate": 4.02e-07, + "loss": 0.0001, + "num_tokens": 2588495.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 162.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05985037237405777, + "kl": 0.005212311400100589, + "learning_rate": 4.016666666666667e-07, + "loss": 0.0002, + "num_tokens": 2588759.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 162.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048719655722379684, + "kl": 0.012635144405066967, + "learning_rate": 4.0133333333333333e-07, + "loss": 0.0006, + "num_tokens": 2589020.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 162.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654810294508934, + "kl": 0.006403079256415367, + "learning_rate": 4.01e-07, + "loss": 0.0003, + "num_tokens": 2589322.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 162.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029209721833467484, + "kl": 0.005799456033855677, + "learning_rate": 4.0066666666666673e-07, + "loss": 0.0003, + "num_tokens": 2589703.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 162.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11022688448429108, + "kl": 0.06149779632687569, + "learning_rate": 4.0033333333333336e-07, + "loss": 0.003, + "num_tokens": 2590056.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 162.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058898910880088806, + "kl": 0.008033715363126248, + "learning_rate": 4e-07, + "loss": 0.0004, + "num_tokens": 2590384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 163.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036348968744277954, + "kl": 0.007658802671357989, + "learning_rate": 3.9966666666666666e-07, + "loss": 0.0003, + "num_tokens": 2590677.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 163.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020849348977208138, + "kl": 0.0010426198423374444, + "learning_rate": 3.993333333333334e-07, + "loss": 0.0001, + "num_tokens": 2590948.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 163.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9002044200897217, + "kl": 0.38438936322927475, + "learning_rate": 3.99e-07, + "loss": 0.0187, + "num_tokens": 2591315.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 8804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 163.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19108644127845764, + "kl": 0.01586221158504486, + "learning_rate": 3.986666666666667e-07, + "loss": 0.001, + "num_tokens": 2591642.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035857606679201126, + "kl": 0.005893201567232609, + "learning_rate": 3.983333333333333e-07, + "loss": 0.0003, + "num_tokens": 2591920.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 163.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369211405515671, + "kl": 0.004830272751860321, + "learning_rate": 3.98e-07, + "loss": 0.0002, + "num_tokens": 2592218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 163.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023553693667054176, + "kl": 0.0012775935174431652, + "learning_rate": 3.976666666666667e-07, + "loss": 0.0001, + "num_tokens": 2592543.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11439812183380127, + "kl": 0.036335716024041176, + "learning_rate": 3.9733333333333334e-07, + "loss": 0.0018, + "num_tokens": 2592869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 163.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001765990979038179, + "kl": 0.0035751760005950928, + "learning_rate": 3.97e-07, + "loss": 0.0002, + "num_tokens": 2593105.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 163.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025588566437363625, + "kl": 0.0005647182551911101, + "learning_rate": 3.9666666666666665e-07, + "loss": 0.0, + "num_tokens": 2593329.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 163.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.770601769152563e-06, + "kl": 1.8998980522155762e-06, + "learning_rate": 3.963333333333334e-07, + "loss": 0.0, + "num_tokens": 2593549.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1666845828294754, + "kl": 0.04413004405796528, + "learning_rate": 3.9600000000000005e-07, + "loss": 0.0022, + "num_tokens": 2593860.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 163.22222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.258151054382324, + "kl": 0.0077703624265268445, + "learning_rate": 3.956666666666667e-07, + "loss": 0.2159, + "num_tokens": 2594101.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 163.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04936312511563301, + "kl": 0.004371569724753499, + "learning_rate": 3.953333333333333e-07, + "loss": 0.0003, + "num_tokens": 2594320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 163.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14638277888298035, + "kl": 0.0029341131448745728, + "learning_rate": 3.95e-07, + "loss": 0.0002, + "num_tokens": 2594528.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017148692160844803, + "kl": 0.018850659020245075, + "learning_rate": 3.946666666666667e-07, + "loss": 0.0011, + "num_tokens": 2594806.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 163.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03652740642428398, + "kl": 0.0014877381036058068, + "learning_rate": 3.9433333333333333e-07, + "loss": 0.0001, + "num_tokens": 2595132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 73.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.496829032897949, + "kl": 0.02485764119774103, + "learning_rate": 3.94e-07, + "loss": 0.4518, + "num_tokens": 2595643.0, + "reward": 5.050000190734863, + "reward_std": 5.900000095367432, + "rewards/reward_combined/mean": 5.050000190734863, + "rewards/reward_combined/std": 5.90000057220459, + "step": 8819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11403830349445343, + "kl": 0.010286442004144192, + "learning_rate": 3.9366666666666663e-07, + "loss": 0.0005, + "num_tokens": 2595935.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 163.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012786706909537315, + "kl": 0.26647940278053284, + "learning_rate": 3.9333333333333336e-07, + "loss": 0.0133, + "num_tokens": 2596239.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 163.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8645275235176086, + "kl": 0.10465402714908123, + "learning_rate": 3.9300000000000004e-07, + "loss": 0.0061, + "num_tokens": 2596626.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 163.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05022507905960083, + "kl": 0.005733102094382048, + "learning_rate": 3.9266666666666666e-07, + "loss": 0.0003, + "num_tokens": 2596918.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 163.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03051046095788479, + "kl": 0.00468259584158659, + "learning_rate": 3.9233333333333334e-07, + "loss": 0.0002, + "num_tokens": 2597209.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 163.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02554413303732872, + "kl": 0.0009007410626509227, + "learning_rate": 3.9199999999999996e-07, + "loss": 0.0, + "num_tokens": 2597518.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 163.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029555946588516235, + "kl": 0.00046730042959097773, + "learning_rate": 3.916666666666667e-07, + "loss": 0.0, + "num_tokens": 2597774.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 163.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9149326682090759, + "kl": 0.07168246898800135, + "learning_rate": 3.9133333333333337e-07, + "loss": 0.0051, + "num_tokens": 2598073.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03249691426753998, + "kl": 0.0016801682650111616, + "learning_rate": 3.91e-07, + "loss": 0.0001, + "num_tokens": 2598369.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007899384014308453, + "kl": 0.16168268769979477, + "learning_rate": 3.906666666666666e-07, + "loss": 0.0081, + "num_tokens": 2598678.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 163.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09292007982730865, + "kl": 0.01604555733501911, + "learning_rate": 3.903333333333334e-07, + "loss": 0.0008, + "num_tokens": 2599029.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 163.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02616010420024395, + "kl": 0.0030356637435033917, + "learning_rate": 3.9e-07, + "loss": 0.0001, + "num_tokens": 2599287.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 163.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.106031894683838, + "kl": 0.24775194749236107, + "learning_rate": 3.8966666666666665e-07, + "loss": 0.1202, + "num_tokens": 2599634.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 8832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 163.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01660798117518425, + "kl": 0.00041464615787845105, + "learning_rate": 3.893333333333333e-07, + "loss": 0.0, + "num_tokens": 2599904.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 163.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02174406871199608, + "kl": 0.0007948676793603227, + "learning_rate": 3.8900000000000006e-07, + "loss": 0.0, + "num_tokens": 2600120.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 163.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004449574276804924, + "kl": 0.00028362125158309937, + "learning_rate": 3.886666666666667e-07, + "loss": 0.0, + "num_tokens": 2600364.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08093485236167908, + "kl": 0.03179318364709616, + "learning_rate": 3.8833333333333336e-07, + "loss": 0.0016, + "num_tokens": 2600664.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 163.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12022906541824341, + "kl": 0.007909214589744806, + "learning_rate": 3.88e-07, + "loss": 0.0003, + "num_tokens": 2600918.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016501858830451965, + "kl": 0.005264589213766158, + "learning_rate": 3.8766666666666666e-07, + "loss": 0.0003, + "num_tokens": 2601247.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 163.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02550019510090351, + "kl": 0.03701267670840025, + "learning_rate": 3.873333333333334e-07, + "loss": 0.0018, + "num_tokens": 2601652.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06674180179834366, + "kl": 0.004657938843593001, + "learning_rate": 3.87e-07, + "loss": 0.0002, + "num_tokens": 2601926.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0161620881408453, + "kl": 0.0010424046195112169, + "learning_rate": 3.866666666666667e-07, + "loss": 0.0001, + "num_tokens": 2602208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 163.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026068571023643017, + "kl": 0.0002920932893175632, + "learning_rate": 3.863333333333333e-07, + "loss": 0.0, + "num_tokens": 2602470.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004999999888241291, + "clip_ratio/low_min": 0.004999999888241291, + "clip_ratio/region_mean": 0.004999999888241291, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 163.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.733632802963257, + "kl": 0.06945792399346828, + "learning_rate": 3.8600000000000004e-07, + "loss": 0.3832, + "num_tokens": 2602846.0, + "reward": 3.049999952316284, + "reward_std": 5.499393939971924, + "rewards/reward_combined/mean": 3.049999952316284, + "rewards/reward_combined/std": 5.499393939971924, + "step": 8843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 163.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06728813052177429, + "kl": 0.0016703461296856403, + "learning_rate": 3.856666666666667e-07, + "loss": 0.0001, + "num_tokens": 2603116.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 163.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19651243090629578, + "kl": 0.033940818160772324, + "learning_rate": 3.8533333333333334e-07, + "loss": 0.0017, + "num_tokens": 2603427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 163.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03139230236411095, + "kl": 0.0015383082791231573, + "learning_rate": 3.8499999999999997e-07, + "loss": 0.0001, + "num_tokens": 2603700.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 163.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05385978892445564, + "kl": 0.006028156960383058, + "learning_rate": 3.8466666666666664e-07, + "loss": 0.0003, + "num_tokens": 2604004.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 163.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028173333033919334, + "kl": 0.00023711472749710083, + "learning_rate": 3.8433333333333337e-07, + "loss": 0.0, + "num_tokens": 2604216.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 163.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08144671469926834, + "kl": 0.0170698466245085, + "learning_rate": 3.84e-07, + "loss": 0.0009, + "num_tokens": 2604502.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 163.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028200753033161163, + "kl": 0.005478203878737986, + "learning_rate": 3.836666666666667e-07, + "loss": 0.0003, + "num_tokens": 2604835.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 163.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022222740575671196, + "kl": 0.011473238468170166, + "learning_rate": 3.833333333333333e-07, + "loss": 0.0006, + "num_tokens": 2605095.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 163.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003030191408470273, + "kl": 0.00035396963357925415, + "learning_rate": 3.8300000000000003e-07, + "loss": 0.0, + "num_tokens": 2605355.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 163.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029974713921546936, + "kl": 0.09397260844707489, + "learning_rate": 3.826666666666667e-07, + "loss": 0.0047, + "num_tokens": 2605728.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 163.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03654379025101662, + "kl": 0.05493824928998947, + "learning_rate": 3.8233333333333333e-07, + "loss": 0.0027, + "num_tokens": 2606111.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8854 + }, + { + "clip_ratio/high_max": 0.012820512987673283, + "clip_ratio/high_mean": 0.012820512987673283, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012820512987673283, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 163.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.654800534248352, + "kl": 0.010154719115234911, + "learning_rate": 3.82e-07, + "loss": 0.0013, + "num_tokens": 2606434.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 164.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0006452057277783751, + "kl": 0.0012512527173385024, + "learning_rate": 3.8166666666666663e-07, + "loss": 0.0001, + "num_tokens": 2606714.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 164.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055416300892829895, + "kl": 0.0012119606253691018, + "learning_rate": 3.8133333333333336e-07, + "loss": 0.0001, + "num_tokens": 2606927.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 164.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10294818133115768, + "kl": 0.029007730074226856, + "learning_rate": 3.8100000000000004e-07, + "loss": 0.0016, + "num_tokens": 2607237.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 31.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 164.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05681667476892471, + "kl": 0.04493633843958378, + "learning_rate": 3.8066666666666666e-07, + "loss": 0.0022, + "num_tokens": 2607643.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 164.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.995560646057129, + "kl": 0.24177805866929702, + "learning_rate": 3.803333333333333e-07, + "loss": -0.1447, + "num_tokens": 2607874.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 8860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 164.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03134210780262947, + "kl": 0.005656247725710273, + "learning_rate": 3.8000000000000007e-07, + "loss": 0.0003, + "num_tokens": 2608142.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 164.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018654853338375688, + "kl": 0.003564111888408661, + "learning_rate": 3.796666666666667e-07, + "loss": 0.0002, + "num_tokens": 2608378.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9843974113464355, + "kl": 0.04015055298805237, + "learning_rate": 3.793333333333333e-07, + "loss": -0.0185, + "num_tokens": 2608648.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 164.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08029883354902267, + "kl": 0.004360925406217575, + "learning_rate": 3.79e-07, + "loss": 0.0003, + "num_tokens": 2608896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 164.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004615867044776678, + "kl": 0.00013434389984467998, + "learning_rate": 3.786666666666667e-07, + "loss": 0.0, + "num_tokens": 2609168.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8865 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.013888888992369175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013888888992369175, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 164.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3918683528900146, + "kl": 0.051639024168252945, + "learning_rate": 3.7833333333333335e-07, + "loss": 0.058, + "num_tokens": 2609473.0, + "reward": 5.25, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 4.5, + "step": 8866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 164.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03771123290061951, + "kl": 0.0054469656315632164, + "learning_rate": 3.78e-07, + "loss": 0.0003, + "num_tokens": 2609800.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 164.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04612196981906891, + "kl": 0.00045797228813171387, + "learning_rate": 3.7766666666666665e-07, + "loss": 0.0, + "num_tokens": 2610010.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 164.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10120661556720734, + "kl": 0.07140768505632877, + "learning_rate": 3.773333333333333e-07, + "loss": 0.0036, + "num_tokens": 2610384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 164.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003295034635812044, + "kl": 0.0016836468130350113, + "learning_rate": 3.7700000000000005e-07, + "loss": 0.0001, + "num_tokens": 2610696.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03494046628475189, + "kl": 0.0013030902482569218, + "learning_rate": 3.766666666666667e-07, + "loss": 0.0001, + "num_tokens": 2610961.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 164.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10659421235322952, + "kl": 0.017446937505155802, + "learning_rate": 3.7633333333333335e-07, + "loss": 0.0009, + "num_tokens": 2611233.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 164.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047392524778842926, + "kl": 0.00733506865799427, + "learning_rate": 3.76e-07, + "loss": 0.0003, + "num_tokens": 2611491.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 164.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012479480355978012, + "kl": 0.26650701463222504, + "learning_rate": 3.756666666666667e-07, + "loss": 0.0133, + "num_tokens": 2611795.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8874 + }, + { + "clip_ratio/high_max": 0.006756756920367479, + "clip_ratio/high_mean": 0.006756756920367479, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006756756920367479, + "completion_length": 41.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 41.25, + "completions/mean_terminated_length": 41.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 164.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3536951541900635, + "kl": 0.13813966512680054, + "learning_rate": 3.753333333333334e-07, + "loss": 0.0897, + "num_tokens": 2612184.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 164.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02731388621032238, + "kl": 0.005873343674466014, + "learning_rate": 3.75e-07, + "loss": 0.0003, + "num_tokens": 2612472.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 164.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006941261235624552, + "kl": 0.16369856894016266, + "learning_rate": 3.7466666666666663e-07, + "loss": 0.0082, + "num_tokens": 2612780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 164.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13367219269275665, + "kl": 0.02340683527290821, + "learning_rate": 3.7433333333333336e-07, + "loss": 0.0012, + "num_tokens": 2613071.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 164.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004712092340923846, + "kl": 0.0012449536588974297, + "learning_rate": 3.74e-07, + "loss": 0.0001, + "num_tokens": 2613351.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 164.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09926892817020416, + "kl": 0.004716712632216513, + "learning_rate": 3.7366666666666666e-07, + "loss": 0.0002, + "num_tokens": 2613622.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 164.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012036683969199657, + "kl": 0.000649003341095522, + "learning_rate": 3.7333333333333334e-07, + "loss": 0.0, + "num_tokens": 2613930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 164.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.629360071499832e-05, + "kl": 2.175569534301758e-06, + "learning_rate": 3.73e-07, + "loss": 0.0, + "num_tokens": 2614150.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 164.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0771701335906982, + "kl": 0.15470555424690247, + "learning_rate": 3.726666666666667e-07, + "loss": 0.0103, + "num_tokens": 2614368.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 164.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.618802785873413, + "kl": 0.7924874499440193, + "learning_rate": 3.7233333333333337e-07, + "loss": 0.01, + "num_tokens": 2614668.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31049013137817383, + "kl": 0.04288405901752412, + "learning_rate": 3.72e-07, + "loss": 0.0025, + "num_tokens": 2614964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 164.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02175561711192131, + "kl": 0.011518369428813457, + "learning_rate": 3.7166666666666667e-07, + "loss": 0.0006, + "num_tokens": 2615224.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 164.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06942158192396164, + "kl": 0.02773621492087841, + "learning_rate": 3.7133333333333335e-07, + "loss": 0.0014, + "num_tokens": 2615566.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 164.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0695282593369484, + "kl": 0.006201328476890922, + "learning_rate": 3.71e-07, + "loss": 0.0003, + "num_tokens": 2615904.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07365050911903381, + "kl": 0.01597495237365365, + "learning_rate": 3.706666666666667e-07, + "loss": 0.0008, + "num_tokens": 2616190.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 164.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003071509301662445, + "kl": 0.00012495517148636281, + "learning_rate": 3.7033333333333333e-07, + "loss": 0.0, + "num_tokens": 2616410.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 164.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020218145102262497, + "kl": 0.0008706642256584018, + "learning_rate": 3.7e-07, + "loss": 0.0, + "num_tokens": 2616690.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 164.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027547791600227356, + "kl": 0.0019597470527514815, + "learning_rate": 3.696666666666667e-07, + "loss": 0.0001, + "num_tokens": 2616950.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 164.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09631503373384476, + "kl": 0.02029264811426401, + "learning_rate": 3.6933333333333336e-07, + "loss": 0.0011, + "num_tokens": 2617238.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 164.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.198466777801514, + "kl": 0.059917932376265526, + "learning_rate": 3.69e-07, + "loss": 0.1073, + "num_tokens": 2617556.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 8894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 164.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.567140579223633, + "kl": 0.014268872095271945, + "learning_rate": 3.686666666666667e-07, + "loss": 0.1812, + "num_tokens": 2617925.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 164.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017244728282094002, + "kl": 0.005691462545655668, + "learning_rate": 3.6833333333333334e-07, + "loss": 0.0003, + "num_tokens": 2618251.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00633375346660614, + "kl": 0.0007037085015326738, + "learning_rate": 3.68e-07, + "loss": 0.0, + "num_tokens": 2618535.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 164.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001179036684334278, + "kl": 0.0004479549825191498, + "learning_rate": 3.676666666666667e-07, + "loss": 0.0, + "num_tokens": 2618795.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 164.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07126681506633759, + "kl": 0.005395851098001003, + "learning_rate": 3.673333333333333e-07, + "loss": 0.0003, + "num_tokens": 2619093.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 164.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0403926819562912, + "kl": 0.008564659859985113, + "learning_rate": 3.67e-07, + "loss": 0.0004, + "num_tokens": 2619421.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 164.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1954672932624817, + "kl": 0.013664277270436287, + "learning_rate": 3.6666666666666667e-07, + "loss": 0.0008, + "num_tokens": 2619642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 164.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7249271869659424, + "kl": 0.10196920670568943, + "learning_rate": 3.6633333333333334e-07, + "loss": 0.0665, + "num_tokens": 2619997.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 164.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6210243701934814, + "kl": 0.06056203693151474, + "learning_rate": 3.66e-07, + "loss": -0.1532, + "num_tokens": 2620346.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.25, + "completions/mean_terminated_length": 28.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 164.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.116154193878174, + "kl": 0.12774553894996643, + "learning_rate": 3.656666666666667e-07, + "loss": 0.0808, + "num_tokens": 2620727.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 8904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 164.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012762450613081455, + "kl": 0.00012174844596302137, + "learning_rate": 3.653333333333333e-07, + "loss": 0.0, + "num_tokens": 2620983.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 164.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04183805361390114, + "kl": 0.003286240331362933, + "learning_rate": 3.6500000000000005e-07, + "loss": 0.0002, + "num_tokens": 2621249.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 164.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02714330516755581, + "kl": 0.0014906234864611179, + "learning_rate": 3.646666666666667e-07, + "loss": 0.0001, + "num_tokens": 2621576.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 164.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025136498734354973, + "kl": 0.001960758410859853, + "learning_rate": 3.643333333333333e-07, + "loss": 0.0001, + "num_tokens": 2621872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 164.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03514412045478821, + "kl": 0.0031722472049295902, + "learning_rate": 3.6400000000000003e-07, + "loss": 0.0002, + "num_tokens": 2622167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.282050848007202, + "kl": 0.18228881061077118, + "learning_rate": 3.6366666666666665e-07, + "loss": 0.0256, + "num_tokens": 2622456.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 165.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.175658226013184, + "kl": 0.17125089094042778, + "learning_rate": 3.6333333333333333e-07, + "loss": -0.2149, + "num_tokens": 2622766.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 8911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13480964303016663, + "kl": 0.014778361655771732, + "learning_rate": 3.63e-07, + "loss": 0.0007, + "num_tokens": 2623033.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 165.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00694236671552062, + "kl": 0.16172915697097778, + "learning_rate": 3.626666666666667e-07, + "loss": 0.0081, + "num_tokens": 2623342.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.388091087341309, + "kl": 0.02993581583723426, + "learning_rate": 3.6233333333333336e-07, + "loss": 0.1351, + "num_tokens": 2623625.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2529824674129486, + "kl": 0.028030208311975002, + "learning_rate": 3.6200000000000004e-07, + "loss": 0.0016, + "num_tokens": 2623925.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 165.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06724633276462555, + "kl": 0.009230048395693302, + "learning_rate": 3.6166666666666666e-07, + "loss": 0.0005, + "num_tokens": 2624261.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 165.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06674270331859589, + "kl": 0.01059711305424571, + "learning_rate": 3.6133333333333334e-07, + "loss": 0.0005, + "num_tokens": 2624553.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 165.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04066598042845726, + "kl": 0.002640068531036377, + "learning_rate": 3.61e-07, + "loss": 0.0001, + "num_tokens": 2624813.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 165.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07767357677221298, + "kl": 0.04681067913770676, + "learning_rate": 3.6066666666666664e-07, + "loss": 0.0023, + "num_tokens": 2625217.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 165.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008731626905500889, + "kl": 0.0003216303884983063, + "learning_rate": 3.6033333333333337e-07, + "loss": 0.0, + "num_tokens": 2625461.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 165.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012828600592911243, + "kl": 0.00016302168660331517, + "learning_rate": 3.6e-07, + "loss": 0.0, + "num_tokens": 2625717.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 165.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041724614799022675, + "kl": 0.013417571317404509, + "learning_rate": 3.5966666666666667e-07, + "loss": 0.0007, + "num_tokens": 2626057.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024062277749180794, + "kl": 0.0016484694206155837, + "learning_rate": 3.5933333333333335e-07, + "loss": 0.0001, + "num_tokens": 2626329.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 165.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013716497458517551, + "kl": 0.26632802188396454, + "learning_rate": 3.59e-07, + "loss": 0.0133, + "num_tokens": 2626633.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 165.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04783390089869499, + "kl": 0.005601051088888198, + "learning_rate": 3.5866666666666665e-07, + "loss": 0.0002, + "num_tokens": 2626947.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 165.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47431907057762146, + "kl": 0.05029179051052779, + "learning_rate": 3.583333333333334e-07, + "loss": 0.0027, + "num_tokens": 2627229.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 165.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04834480956196785, + "kl": 0.025549123995006084, + "learning_rate": 3.58e-07, + "loss": 0.0012, + "num_tokens": 2627575.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 165.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.4278645515441895, + "kl": 0.011338358279317617, + "learning_rate": 3.576666666666667e-07, + "loss": 0.072, + "num_tokens": 2627868.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 165.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020694900304079056, + "kl": 1.8887221813201904e-05, + "learning_rate": 3.5733333333333336e-07, + "loss": 0.0, + "num_tokens": 2628080.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0922953188419342, + "kl": 0.0055123771307989955, + "learning_rate": 3.57e-07, + "loss": 0.0003, + "num_tokens": 2628353.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 165.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04353654384613037, + "kl": 0.016483448445796967, + "learning_rate": 3.5666666666666666e-07, + "loss": 0.0008, + "num_tokens": 2628658.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09266700595617294, + "kl": 0.039644776843488216, + "learning_rate": 3.5633333333333333e-07, + "loss": 0.002, + "num_tokens": 2628950.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 165.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023396484553813934, + "kl": 0.0011999238631688058, + "learning_rate": 3.56e-07, + "loss": 0.0001, + "num_tokens": 2629228.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06482604891061783, + "kl": 0.010835106950253248, + "learning_rate": 3.556666666666667e-07, + "loss": 0.0005, + "num_tokens": 2629546.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 165.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009873195551335812, + "kl": 0.0727309063076973, + "learning_rate": 3.5533333333333337e-07, + "loss": 0.0037, + "num_tokens": 2629916.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 165.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035874859895557165, + "kl": 0.00014355778694152832, + "learning_rate": 3.55e-07, + "loss": 0.0, + "num_tokens": 2630136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 165.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.541063678218052e-05, + "kl": 2.339482307434082e-06, + "learning_rate": 3.546666666666667e-07, + "loss": 0.0, + "num_tokens": 2630356.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 8937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06692533195018768, + "kl": 0.0063443309627473354, + "learning_rate": 3.5433333333333334e-07, + "loss": 0.0003, + "num_tokens": 2630654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 165.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033889184705913067, + "kl": 0.0003566415543900803, + "learning_rate": 3.5399999999999997e-07, + "loss": 0.0, + "num_tokens": 2630889.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 165.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07119540870189667, + "kl": 0.0018159648170694709, + "learning_rate": 3.536666666666667e-07, + "loss": 0.0001, + "num_tokens": 2631099.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 80.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 165.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9200172424316406, + "kl": 0.018058304209262133, + "learning_rate": 3.533333333333333e-07, + "loss": 0.4624, + "num_tokens": 2631641.0, + "reward": 6.300000190734863, + "reward_std": 2.4000000953674316, + "rewards/reward_combined/mean": 6.300000190734863, + "rewards/reward_combined/std": 2.3999998569488525, + "step": 8941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 165.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20624007284641266, + "kl": 0.03857899643480778, + "learning_rate": 3.53e-07, + "loss": 0.0022, + "num_tokens": 2632025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 165.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040238816291093826, + "kl": 0.048973362892866135, + "learning_rate": 3.526666666666667e-07, + "loss": 0.0024, + "num_tokens": 2632367.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005650758743286, + "kl": 0.007193847734015435, + "learning_rate": 3.5233333333333335e-07, + "loss": 0.0004, + "num_tokens": 2632654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05416218191385269, + "kl": 0.0069457958452403545, + "learning_rate": 3.5200000000000003e-07, + "loss": 0.0003, + "num_tokens": 2632946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021506691351532936, + "kl": 0.0026105040742550045, + "learning_rate": 3.516666666666667e-07, + "loss": 0.0001, + "num_tokens": 2633216.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 165.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.81379222869873, + "kl": 2.445858425926417, + "learning_rate": 3.5133333333333333e-07, + "loss": 0.164, + "num_tokens": 2633498.0, + "reward": 7.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 7.25, + "rewards/reward_combined/std": 1.5, + "step": 8947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 165.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01908211223781109, + "kl": 0.0008574156381655484, + "learning_rate": 3.51e-07, + "loss": 0.0, + "num_tokens": 2633821.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 165.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05590381845831871, + "kl": 0.013555833138525486, + "learning_rate": 3.506666666666667e-07, + "loss": 0.0008, + "num_tokens": 2634095.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 165.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0392288975417614, + "kl": 0.003294804133474827, + "learning_rate": 3.503333333333333e-07, + "loss": 0.0002, + "num_tokens": 2634407.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 165.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014749441295862198, + "kl": 0.0035708002251340076, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0001, + "num_tokens": 2634667.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 165.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002090618945658207, + "kl": 0.00046838074922561646, + "learning_rate": 3.4966666666666666e-07, + "loss": 0.0, + "num_tokens": 2634927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 165.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001910749590024352, + "kl": 0.003546901047229767, + "learning_rate": 3.4933333333333334e-07, + "loss": 0.0002, + "num_tokens": 2635163.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 165.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01774788461625576, + "kl": 0.008146382169798017, + "learning_rate": 3.49e-07, + "loss": 0.0004, + "num_tokens": 2635489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 165.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028992164880037308, + "kl": 0.001073649211321026, + "learning_rate": 3.486666666666667e-07, + "loss": 0.0001, + "num_tokens": 2635797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 165.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017621587961912155, + "kl": 0.000513613224029541, + "learning_rate": 3.483333333333333e-07, + "loss": 0.0, + "num_tokens": 2636009.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 35.25, + "completions/mean_terminated_length": 35.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 165.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24761372804641724, + "kl": 0.07010496780276299, + "learning_rate": 3.4800000000000005e-07, + "loss": 0.0037, + "num_tokens": 2636374.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 165.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10244481265544891, + "kl": 0.012026742100715637, + "learning_rate": 3.4766666666666667e-07, + "loss": 0.0006, + "num_tokens": 2636615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 165.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.783552408218384, + "kl": 0.659954097121954, + "learning_rate": 3.4733333333333335e-07, + "loss": 0.0648, + "num_tokens": 2636876.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 165.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7953356504440308, + "kl": 0.10929779708385468, + "learning_rate": 3.47e-07, + "loss": 0.0544, + "num_tokens": 2637257.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 165.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8628766536712646, + "kl": 0.1270136758685112, + "learning_rate": 3.4666666666666665e-07, + "loss": -0.1134, + "num_tokens": 2637607.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 8961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 165.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11938206106424332, + "kl": 0.02930101566016674, + "learning_rate": 3.463333333333333e-07, + "loss": 0.0016, + "num_tokens": 2637936.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 165.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03477209806442261, + "kl": 0.003304382844362408, + "learning_rate": 3.46e-07, + "loss": 0.0002, + "num_tokens": 2638226.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009456836618483067, + "kl": 0.00017060488607967272, + "learning_rate": 3.456666666666667e-07, + "loss": 0.0, + "num_tokens": 2638496.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01614745892584324, + "kl": 0.00039904813093016855, + "learning_rate": 3.4533333333333336e-07, + "loss": 0.0, + "num_tokens": 2638766.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07821919023990631, + "kl": 0.017789161298424006, + "learning_rate": 3.4500000000000003e-07, + "loss": 0.0009, + "num_tokens": 2639052.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 166.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.351588487625122, + "kl": 0.21713664382696152, + "learning_rate": 3.4466666666666666e-07, + "loss": 0.0701, + "num_tokens": 2639391.0, + "reward": 5.375, + "reward_std": 2.462214469909668, + "rewards/reward_combined/mean": 5.375, + "rewards/reward_combined/std": 2.462214469909668, + "step": 8967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 166.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11390016227960587, + "kl": 0.008551203645765781, + "learning_rate": 3.443333333333334e-07, + "loss": 0.0004, + "num_tokens": 2639674.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 166.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04560841992497444, + "kl": 0.0010352313402108848, + "learning_rate": 3.44e-07, + "loss": 0.0001, + "num_tokens": 2639887.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018355349078774452, + "kl": 0.00022319257914205082, + "learning_rate": 3.4366666666666663e-07, + "loss": 0.0, + "num_tokens": 2640143.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 166.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08908860385417938, + "kl": 0.01137493271380663, + "learning_rate": 3.4333333333333336e-07, + "loss": 0.0006, + "num_tokens": 2640473.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 166.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01968403533101082, + "kl": 0.0007319689611904323, + "learning_rate": 3.43e-07, + "loss": 0.0, + "num_tokens": 2640793.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.014540672302246, + "kl": 0.027785656973719597, + "learning_rate": 3.4266666666666666e-07, + "loss": 0.1351, + "num_tokens": 2641060.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 8973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010650206357240677, + "kl": 0.0007479600608348846, + "learning_rate": 3.4233333333333334e-07, + "loss": 0.0, + "num_tokens": 2641346.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 166.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03259936720132828, + "kl": 0.002014004574448336, + "learning_rate": 3.42e-07, + "loss": 0.0001, + "num_tokens": 2641658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 166.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027884047478437424, + "kl": 0.002687966451048851, + "learning_rate": 3.4166666666666664e-07, + "loss": 0.0002, + "num_tokens": 2641885.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 166.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08815071731805801, + "kl": 0.002794707892462611, + "learning_rate": 3.4133333333333337e-07, + "loss": 0.0002, + "num_tokens": 2642097.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060912683606147766, + "kl": 0.016964766662567854, + "learning_rate": 3.41e-07, + "loss": 0.0009, + "num_tokens": 2642381.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 166.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06269055604934692, + "kl": 0.008007180411368608, + "learning_rate": 3.4066666666666667e-07, + "loss": 0.0004, + "num_tokens": 2642712.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0740547701716423, + "kl": 0.0038766830693930387, + "learning_rate": 3.4033333333333335e-07, + "loss": 0.0002, + "num_tokens": 2642988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 166.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021885257214307785, + "kl": 0.09568078815937042, + "learning_rate": 3.4e-07, + "loss": 0.0048, + "num_tokens": 2643360.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 166.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037093162536621094, + "kl": 0.012666834518313408, + "learning_rate": 3.396666666666667e-07, + "loss": 0.0006, + "num_tokens": 2643663.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0044247787445783615, + "clip_ratio/low_min": 0.0044247787445783615, + "clip_ratio/region_mean": 0.0044247787445783615, + "completion_length": 43.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 43.25, + "completions/mean_terminated_length": 43.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.776369333267212, + "kl": 0.045760709792375565, + "learning_rate": 3.3933333333333333e-07, + "loss": 0.3509, + "num_tokens": 2644088.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 8983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 166.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.3478779792785645, + "kl": 0.24215000867843628, + "learning_rate": 3.39e-07, + "loss": -0.0877, + "num_tokens": 2644372.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 8984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 38.5, + "completions/mean_terminated_length": 38.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 166.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8391432762145996, + "kl": 0.09275639988481998, + "learning_rate": 3.386666666666667e-07, + "loss": 0.0384, + "num_tokens": 2644742.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 8985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 166.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004364797845482826, + "kl": 0.00026813894510269165, + "learning_rate": 3.3833333333333336e-07, + "loss": 0.0, + "num_tokens": 2644986.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 166.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013198398053646088, + "kl": 0.0007692250364925712, + "learning_rate": 3.38e-07, + "loss": 0.0, + "num_tokens": 2645254.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0594061017036438, + "kl": 0.007339270319789648, + "learning_rate": 3.376666666666667e-07, + "loss": 0.0003, + "num_tokens": 2645549.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.383419990539551, + "kl": 0.0505806072615087, + "learning_rate": 3.3733333333333334e-07, + "loss": 0.0497, + "num_tokens": 2645846.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 8989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 166.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022364147007465363, + "kl": 0.004210047423839569, + "learning_rate": 3.37e-07, + "loss": 0.0002, + "num_tokens": 2646136.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 166.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026250915601849556, + "kl": 0.005221094004809856, + "learning_rate": 3.366666666666667e-07, + "loss": 0.0003, + "num_tokens": 2646424.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.70198917388916, + "kl": 0.020322605734691024, + "learning_rate": 3.363333333333333e-07, + "loss": 0.1312, + "num_tokens": 2646691.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 8992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 166.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0409669429063797, + "kl": 0.060269128531217575, + "learning_rate": 3.36e-07, + "loss": 0.003, + "num_tokens": 2647063.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.0, + "completions/mean_terminated_length": 29.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 166.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7887897491455078, + "kl": 0.11246787011623383, + "learning_rate": 3.3566666666666667e-07, + "loss": 0.0064, + "num_tokens": 2647407.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 166.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020631065126508474, + "kl": 0.0035220980644226074, + "learning_rate": 3.3533333333333334e-07, + "loss": 0.0002, + "num_tokens": 2647643.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 8995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 41.0, + "completions/mean_terminated_length": 41.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 166.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0824362188577652, + "kl": 0.032608781941235065, + "learning_rate": 3.35e-07, + "loss": 0.0018, + "num_tokens": 2648031.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15901196002960205, + "kl": 0.011640347132924944, + "learning_rate": 3.346666666666667e-07, + "loss": 0.0007, + "num_tokens": 2648306.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 8997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09696836769580841, + "kl": 0.018158008344471455, + "learning_rate": 3.343333333333333e-07, + "loss": 0.0009, + "num_tokens": 2648629.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 8998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 166.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04881034046411514, + "kl": 0.0047300157602876425, + "learning_rate": 3.3400000000000005e-07, + "loss": 0.0002, + "num_tokens": 2648943.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 8999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 166.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.4322829246521, + "kl": 0.0021250458667054772, + "learning_rate": 3.336666666666667e-07, + "loss": -0.0354, + "num_tokens": 2649225.0, + "reward": 7.625, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.625, + "rewards/reward_combined/std": 0.25, + "step": 9000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 166.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047932110726833344, + "kl": 0.0020593113731592894, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0001, + "num_tokens": 2649525.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 166.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012752959504723549, + "kl": 0.26652343571186066, + "learning_rate": 3.3300000000000003e-07, + "loss": 0.0133, + "num_tokens": 2649829.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 166.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011644826008705422, + "kl": 2.6226043701171875e-06, + "learning_rate": 3.3266666666666665e-07, + "loss": 0.0, + "num_tokens": 2650049.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03149415925145149, + "kl": 0.16143415868282318, + "learning_rate": 3.3233333333333333e-07, + "loss": 0.0081, + "num_tokens": 2650359.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 166.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005765592213720083, + "kl": 0.00034224688715767115, + "learning_rate": 3.32e-07, + "loss": 0.0, + "num_tokens": 2650619.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 166.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023634400218725204, + "kl": 0.04280726984143257, + "learning_rate": 3.316666666666667e-07, + "loss": 0.0021, + "num_tokens": 2651023.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 166.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05035879462957382, + "kl": 0.0018155035795643926, + "learning_rate": 3.313333333333333e-07, + "loss": 0.0001, + "num_tokens": 2651256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 166.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007576784119009972, + "kl": 0.0014271500403992832, + "learning_rate": 3.3100000000000004e-07, + "loss": 0.0001, + "num_tokens": 2651533.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9008 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.013888888992369175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013888888992369175, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5049614906311035, + "kl": 0.24596751108765602, + "learning_rate": 3.3066666666666666e-07, + "loss": 0.0226, + "num_tokens": 2651832.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.396388292312622, + "kl": 0.02755722193978727, + "learning_rate": 3.3033333333333334e-07, + "loss": -0.0629, + "num_tokens": 2652132.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 9010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 166.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0171686839312315, + "kl": 0.0127438441850245, + "learning_rate": 3.3e-07, + "loss": 0.0006, + "num_tokens": 2652392.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 166.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12279807031154633, + "kl": 0.017405035556294024, + "learning_rate": 3.2966666666666664e-07, + "loss": 0.0009, + "num_tokens": 2652686.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 166.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0047224522568285465, + "kl": 0.00045462697744369507, + "learning_rate": 3.2933333333333337e-07, + "loss": 0.0, + "num_tokens": 2652946.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 166.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003492511110380292, + "kl": 0.00013096928159939125, + "learning_rate": 3.29e-07, + "loss": 0.0, + "num_tokens": 2653166.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 166.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026813959702849388, + "kl": 0.000827362178824842, + "learning_rate": 3.2866666666666667e-07, + "loss": 0.0, + "num_tokens": 2653382.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 166.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1722688227891922, + "kl": 0.020217320881783962, + "learning_rate": 3.2833333333333335e-07, + "loss": 0.001, + "num_tokens": 2653655.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 166.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013961921446025372, + "kl": 0.01194792427122593, + "learning_rate": 3.28e-07, + "loss": 0.0007, + "num_tokens": 2653929.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 167.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23052072525024414, + "kl": 0.024137687403708696, + "learning_rate": 3.2766666666666665e-07, + "loss": 0.0011, + "num_tokens": 2654268.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 167.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02728920988738537, + "kl": 0.0011898605152964592, + "learning_rate": 3.273333333333334e-07, + "loss": 0.0001, + "num_tokens": 2654560.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 167.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.350365161895752, + "kl": 0.12469993159174919, + "learning_rate": 3.27e-07, + "loss": -0.1039, + "num_tokens": 2654927.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 9020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 167.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044807445257902145, + "kl": 0.032632600516080856, + "learning_rate": 3.266666666666667e-07, + "loss": 0.0016, + "num_tokens": 2655231.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 167.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013114286586642265, + "kl": 0.2664356380701065, + "learning_rate": 3.2633333333333336e-07, + "loss": 0.0133, + "num_tokens": 2655535.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 167.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031016511842608452, + "kl": 0.0012240736396051943, + "learning_rate": 3.26e-07, + "loss": 0.0001, + "num_tokens": 2655769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 167.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006127322558313608, + "kl": 0.0004086077242391184, + "learning_rate": 3.2566666666666666e-07, + "loss": 0.0, + "num_tokens": 2656029.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 167.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.217532634735107, + "kl": 0.07824867963790894, + "learning_rate": 3.2533333333333333e-07, + "loss": 0.0085, + "num_tokens": 2656352.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 9025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 167.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03403575345873833, + "kl": 0.002095251576974988, + "learning_rate": 3.25e-07, + "loss": 0.0001, + "num_tokens": 2656650.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9026 + }, + { + "clip_ratio/high_max": 0.0017667844658717513, + "clip_ratio/high_mean": 0.0017667844658717513, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017667844658717513, + "completion_length": 88.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 88.5, + "completions/mean_terminated_length": 32.66666793823242, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 167.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.064964532852173, + "kl": 0.18354413658380508, + "learning_rate": 3.246666666666667e-07, + "loss": 0.3913, + "num_tokens": 2657220.0, + "reward": 2.875, + "reward_std": 4.75, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 4.75, + "step": 9027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 167.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13474050164222717, + "kl": 0.007781546883052215, + "learning_rate": 3.2433333333333337e-07, + "loss": 0.0004, + "num_tokens": 2657490.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0582008995115757, + "kl": 0.009410197380930185, + "learning_rate": 3.24e-07, + "loss": 0.0004, + "num_tokens": 2657782.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 167.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06137774884700775, + "kl": 0.006446503335610032, + "learning_rate": 3.236666666666667e-07, + "loss": 0.0003, + "num_tokens": 2658055.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 167.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009382962249219418, + "kl": 0.0012312799808569252, + "learning_rate": 3.2333333333333334e-07, + "loss": 0.0001, + "num_tokens": 2658315.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 167.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045821577310562134, + "kl": 0.006673014722764492, + "learning_rate": 3.2299999999999997e-07, + "loss": 0.0004, + "num_tokens": 2658637.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 167.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3322956562042236, + "kl": 0.007913945824839175, + "learning_rate": 3.226666666666667e-07, + "loss": -0.0016, + "num_tokens": 2658969.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01629730314016342, + "kl": 0.004390859045088291, + "learning_rate": 3.223333333333333e-07, + "loss": 0.0002, + "num_tokens": 2659251.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 167.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9045982360839844, + "kl": 0.01428734790533781, + "learning_rate": 3.22e-07, + "loss": -0.0053, + "num_tokens": 2659550.0, + "reward": 4.625, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 9035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 167.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02386181429028511, + "kl": 0.04284735023975372, + "learning_rate": 3.216666666666667e-07, + "loss": 0.0021, + "num_tokens": 2659954.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 167.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05300671234726906, + "kl": 0.025482898578047752, + "learning_rate": 3.2133333333333335e-07, + "loss": 0.0013, + "num_tokens": 2660337.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 167.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5597089529037476, + "kl": 0.06891607865691185, + "learning_rate": 3.21e-07, + "loss": 0.0037, + "num_tokens": 2660670.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 167.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04626183584332466, + "kl": 0.0034574606688693166, + "learning_rate": 3.206666666666667e-07, + "loss": 0.0002, + "num_tokens": 2660935.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 167.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005932462867349386, + "kl": 0.16389703750610352, + "learning_rate": 3.2033333333333333e-07, + "loss": 0.0082, + "num_tokens": 2661243.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 167.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014954416081309319, + "kl": 0.000551818564417772, + "learning_rate": 3.2e-07, + "loss": 0.0, + "num_tokens": 2661562.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 167.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02260546386241913, + "kl": 0.09551291167736053, + "learning_rate": 3.196666666666667e-07, + "loss": 0.0048, + "num_tokens": 2661934.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09751478582620621, + "kl": 0.0205126847140491, + "learning_rate": 3.193333333333333e-07, + "loss": 0.0012, + "num_tokens": 2662220.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 167.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013813708908855915, + "kl": 0.0005569718778133392, + "learning_rate": 3.1900000000000004e-07, + "loss": 0.0, + "num_tokens": 2662480.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 167.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001637967536225915, + "kl": 9.602904174244031e-05, + "learning_rate": 3.1866666666666666e-07, + "loss": 0.0, + "num_tokens": 2662736.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 167.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13162198662757874, + "kl": 0.027825507801026106, + "learning_rate": 3.1833333333333334e-07, + "loss": 0.0015, + "num_tokens": 2663058.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 167.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09343654662370682, + "kl": 0.0032493870239704847, + "learning_rate": 3.18e-07, + "loss": 0.0002, + "num_tokens": 2663288.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 167.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.311008453369141, + "kl": 0.025621794629842043, + "learning_rate": 3.176666666666667e-07, + "loss": 0.0451, + "num_tokens": 2663597.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 167.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003543616272509098, + "kl": 8.435795461991802e-05, + "learning_rate": 3.173333333333333e-07, + "loss": 0.0, + "num_tokens": 2663869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 167.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05114828422665596, + "kl": 0.006182837300002575, + "learning_rate": 3.1700000000000005e-07, + "loss": 0.0003, + "num_tokens": 2664137.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 167.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4044647216796875, + "kl": 0.09645114466547966, + "learning_rate": 3.1666666666666667e-07, + "loss": 0.1221, + "num_tokens": 2664488.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 167.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019824182614684105, + "kl": 0.0007872451096773148, + "learning_rate": 3.1633333333333335e-07, + "loss": 0.0, + "num_tokens": 2664800.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 167.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024676613509655, + "kl": 0.0011943488207180053, + "learning_rate": 3.16e-07, + "loss": 0.0001, + "num_tokens": 2665066.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0048749735578894615, + "kl": 0.0005660813767462969, + "learning_rate": 3.1566666666666665e-07, + "loss": 0.0, + "num_tokens": 2665350.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 167.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017648622393608093, + "kl": 0.0005122125148773193, + "learning_rate": 3.153333333333333e-07, + "loss": 0.0, + "num_tokens": 2665562.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 29.5, + "completions/mean_terminated_length": 29.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 167.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.30617356300354, + "kl": 0.0075485792476683855, + "learning_rate": 3.15e-07, + "loss": 0.0947, + "num_tokens": 2665908.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 167.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06386909633874893, + "kl": 0.033163134939968586, + "learning_rate": 3.146666666666667e-07, + "loss": 0.0017, + "num_tokens": 2666180.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 167.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0004050778807140887, + "kl": 0.0012495687697082758, + "learning_rate": 3.1433333333333336e-07, + "loss": 0.0001, + "num_tokens": 2666460.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 167.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015987787628546357, + "kl": 3.56137752532959e-06, + "learning_rate": 3.1400000000000003e-07, + "loss": 0.0, + "num_tokens": 2666680.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05841468274593353, + "kl": 0.004395393072627485, + "learning_rate": 3.1366666666666666e-07, + "loss": 0.0002, + "num_tokens": 2666980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 167.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02497151494026184, + "kl": 0.0011480699395178817, + "learning_rate": 3.133333333333334e-07, + "loss": 0.0001, + "num_tokens": 2667289.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 167.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003614462213590741, + "kl": 0.0001339554801234044, + "learning_rate": 3.13e-07, + "loss": 0.0, + "num_tokens": 2667509.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 167.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09565560519695282, + "kl": 0.02800754737108946, + "learning_rate": 3.1266666666666663e-07, + "loss": 0.0015, + "num_tokens": 2667797.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 167.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6577396392822266, + "kl": 0.01220496604219079, + "learning_rate": 3.1233333333333336e-07, + "loss": 0.0013, + "num_tokens": 2668088.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 167.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04903611168265343, + "kl": 0.0012084171175956726, + "learning_rate": 3.12e-07, + "loss": 0.0001, + "num_tokens": 2668298.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 167.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04346535727381706, + "kl": 0.02796619851142168, + "learning_rate": 3.1166666666666666e-07, + "loss": 0.0012, + "num_tokens": 2668646.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 167.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08011380583047867, + "kl": 0.00722011923789978, + "learning_rate": 3.1133333333333334e-07, + "loss": 0.0004, + "num_tokens": 2668908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 167.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09081485867500305, + "kl": 0.00112876296043396, + "learning_rate": 3.11e-07, + "loss": 0.0001, + "num_tokens": 2669120.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 167.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001803085790015757, + "kl": 0.003569498658180237, + "learning_rate": 3.1066666666666664e-07, + "loss": 0.0002, + "num_tokens": 2669356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 167.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5184289216995239, + "kl": 0.03959103426313959, + "learning_rate": 3.1033333333333337e-07, + "loss": 0.0024, + "num_tokens": 2669671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 167.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06740084290504456, + "kl": 0.0050421059131622314, + "learning_rate": 3.1e-07, + "loss": 0.0003, + "num_tokens": 2669915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 168.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017893768846988678, + "kl": 0.01260658772662282, + "learning_rate": 3.0966666666666667e-07, + "loss": 0.0006, + "num_tokens": 2670175.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 168.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.982280731201172, + "kl": 0.02883831597864628, + "learning_rate": 3.0933333333333335e-07, + "loss": 0.1784, + "num_tokens": 2670522.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 168.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12652860581874847, + "kl": 0.009109247475862503, + "learning_rate": 3.09e-07, + "loss": 0.0005, + "num_tokens": 2670766.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 168.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03454748913645744, + "kl": 0.0017968494212254882, + "learning_rate": 3.086666666666667e-07, + "loss": 0.0001, + "num_tokens": 2671000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008845863863825798, + "kl": 0.00047231465578079224, + "learning_rate": 3.0833333333333333e-07, + "loss": 0.0, + "num_tokens": 2671212.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 168.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13079652190208435, + "kl": 0.07701052911579609, + "learning_rate": 3.08e-07, + "loss": 0.0041, + "num_tokens": 2671627.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.709502696990967, + "kl": 0.27142958249896765, + "learning_rate": 3.076666666666667e-07, + "loss": 0.0145, + "num_tokens": 2671901.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09337548911571503, + "kl": 0.004717143252491951, + "learning_rate": 3.0733333333333336e-07, + "loss": 0.0003, + "num_tokens": 2672128.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018021861324086785, + "kl": 0.003566339612007141, + "learning_rate": 3.07e-07, + "loss": 0.0002, + "num_tokens": 2672364.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 168.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029145289212465286, + "kl": 0.0039941276190802455, + "learning_rate": 3.066666666666667e-07, + "loss": 0.0002, + "num_tokens": 2672632.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 168.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.461592674255371, + "kl": 0.16257892549037933, + "learning_rate": 3.0633333333333334e-07, + "loss": -0.042, + "num_tokens": 2672995.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 9082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 168.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022829370573163033, + "kl": 0.0014499109238386154, + "learning_rate": 3.06e-07, + "loss": 0.0001, + "num_tokens": 2673307.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 168.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021950727328658104, + "kl": 0.0012737291399389505, + "learning_rate": 3.056666666666667e-07, + "loss": 0.0001, + "num_tokens": 2673597.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 168.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05887385457754135, + "kl": 0.004343248903751373, + "learning_rate": 3.053333333333333e-07, + "loss": 0.0002, + "num_tokens": 2673873.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.792623829212971e-05, + "kl": 2.2426247596740723e-06, + "learning_rate": 3.05e-07, + "loss": 0.0, + "num_tokens": 2674093.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 168.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06482207030057907, + "kl": 0.0340889748185873, + "learning_rate": 3.0466666666666667e-07, + "loss": 0.0017, + "num_tokens": 2674393.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 168.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1505323052406311, + "kl": 0.018018494360148907, + "learning_rate": 3.0433333333333335e-07, + "loss": 0.0009, + "num_tokens": 2674730.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 168.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06641481816768646, + "kl": 0.018404459580779076, + "learning_rate": 3.04e-07, + "loss": 0.0009, + "num_tokens": 2675002.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03631287440657616, + "kl": 0.020896779373288155, + "learning_rate": 3.036666666666667e-07, + "loss": 0.0011, + "num_tokens": 2675275.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 168.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020260119810700417, + "kl": 0.004041685722768307, + "learning_rate": 3.033333333333333e-07, + "loss": 0.0002, + "num_tokens": 2675565.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 168.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033487025648355484, + "kl": 0.006246573058888316, + "learning_rate": 3.0300000000000005e-07, + "loss": 0.0003, + "num_tokens": 2675908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01766207069158554, + "kl": 0.002122357487678528, + "learning_rate": 3.026666666666667e-07, + "loss": 0.0001, + "num_tokens": 2676125.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 168.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06749314069747925, + "kl": 0.05411577969789505, + "learning_rate": 3.023333333333333e-07, + "loss": 0.0027, + "num_tokens": 2676494.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 168.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0071084643714129925, + "kl": 0.1596687063574791, + "learning_rate": 3.0200000000000003e-07, + "loss": 0.008, + "num_tokens": 2676804.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058292195200920105, + "kl": 0.014927006792277098, + "learning_rate": 3.0166666666666665e-07, + "loss": 0.0008, + "num_tokens": 2677090.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 168.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012844592332839966, + "kl": 0.07325214520096779, + "learning_rate": 3.0133333333333333e-07, + "loss": 0.0037, + "num_tokens": 2677460.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 168.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04029526561498642, + "kl": 0.03668023273348808, + "learning_rate": 3.01e-07, + "loss": 0.002, + "num_tokens": 2677835.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.292428970336914, + "kl": 0.02206529534305446, + "learning_rate": 3.006666666666667e-07, + "loss": 0.112, + "num_tokens": 2678127.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 168.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.2068705558776855, + "kl": 0.08380471915006638, + "learning_rate": 3.003333333333333e-07, + "loss": 0.0748, + "num_tokens": 2678465.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 168.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028131909668445587, + "kl": 0.005242582177743316, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0003, + "num_tokens": 2678733.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 168.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5013275146484375, + "kl": 0.12188638374209404, + "learning_rate": 2.9966666666666666e-07, + "loss": 0.2227, + "num_tokens": 2678982.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 168.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035127200186252594, + "kl": 0.005436467472463846, + "learning_rate": 2.9933333333333334e-07, + "loss": 0.0002, + "num_tokens": 2679302.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 168.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017706643790006638, + "kl": 0.01266731508076191, + "learning_rate": 2.99e-07, + "loss": 0.0006, + "num_tokens": 2679562.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.262886047363281, + "kl": 0.06739437009673566, + "learning_rate": 2.9866666666666664e-07, + "loss": 0.008, + "num_tokens": 2679838.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 168.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.27232027053833, + "kl": 0.20473309140652418, + "learning_rate": 2.9833333333333337e-07, + "loss": 0.0108, + "num_tokens": 2680166.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 168.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20640438795089722, + "kl": 0.05750578595325351, + "learning_rate": 2.98e-07, + "loss": 0.0018, + "num_tokens": 2680529.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 168.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056533508002758026, + "kl": 0.03394610807299614, + "learning_rate": 2.9766666666666667e-07, + "loss": 0.0017, + "num_tokens": 2680826.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 168.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03272896260023117, + "kl": 0.0019635865464806557, + "learning_rate": 2.9733333333333335e-07, + "loss": 0.0001, + "num_tokens": 2681140.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 168.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1420769840478897, + "kl": 0.02583632292225957, + "learning_rate": 2.97e-07, + "loss": 0.0014, + "num_tokens": 2681447.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 168.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4552088975906372, + "kl": 0.0613188095157966, + "learning_rate": 2.9666666666666665e-07, + "loss": 0.0031, + "num_tokens": 2681756.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 168.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7372629642486572, + "kl": 0.11151259989128448, + "learning_rate": 2.963333333333334e-07, + "loss": 0.0058, + "num_tokens": 2682017.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 168.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004043793771415949, + "kl": 0.0003586001694202423, + "learning_rate": 2.96e-07, + "loss": 0.0, + "num_tokens": 2682277.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1454576551914215, + "kl": 0.019353345967829227, + "learning_rate": 2.956666666666667e-07, + "loss": 0.001, + "num_tokens": 2682555.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 168.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0465005598962307, + "kl": 0.006566162686794996, + "learning_rate": 2.9533333333333336e-07, + "loss": 0.0003, + "num_tokens": 2682854.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 168.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.43169453740119934, + "kl": 0.06690465216524899, + "learning_rate": 2.95e-07, + "loss": 0.0034, + "num_tokens": 2683152.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 168.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11691901832818985, + "kl": 0.01078946515917778, + "learning_rate": 2.9466666666666666e-07, + "loss": 0.0006, + "num_tokens": 2683487.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 168.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015972651541233063, + "kl": 0.0004284679889678955, + "learning_rate": 2.9433333333333334e-07, + "loss": 0.0, + "num_tokens": 2683699.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 168.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007666092831641436, + "kl": 0.00038790104736108333, + "learning_rate": 2.94e-07, + "loss": 0.0, + "num_tokens": 2684018.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 168.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013481322675943375, + "kl": 0.2663469910621643, + "learning_rate": 2.936666666666667e-07, + "loss": 0.0133, + "num_tokens": 2684322.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 168.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05399410054087639, + "kl": 0.0011048614978790283, + "learning_rate": 2.9333333333333337e-07, + "loss": 0.0001, + "num_tokens": 2684530.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 168.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043435242027044296, + "kl": 0.0033705367241054773, + "learning_rate": 2.93e-07, + "loss": 0.0002, + "num_tokens": 2684804.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 168.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007465688977390528, + "kl": 0.00134140788577497, + "learning_rate": 2.926666666666667e-07, + "loss": 0.0001, + "num_tokens": 2685081.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 168.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09124170243740082, + "kl": 0.012778798583894968, + "learning_rate": 2.9233333333333334e-07, + "loss": 0.0006, + "num_tokens": 2685373.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 168.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0337088517844677, + "kl": 0.05377374589443207, + "learning_rate": 2.9199999999999997e-07, + "loss": 0.0027, + "num_tokens": 2685719.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035882391966879368, + "kl": 6.958246376598254e-05, + "learning_rate": 2.916666666666667e-07, + "loss": 0.0, + "num_tokens": 2685975.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 169.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018008166924118996, + "kl": 0.0048581333830952644, + "learning_rate": 2.913333333333333e-07, + "loss": 0.0002, + "num_tokens": 2686264.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 169.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02369351126253605, + "kl": 0.04278396815061569, + "learning_rate": 2.91e-07, + "loss": 0.0021, + "num_tokens": 2686668.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 169.05555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9191031455993652, + "kl": 0.3539789766073227, + "learning_rate": 2.906666666666667e-07, + "loss": 0.0359, + "num_tokens": 2686966.0, + "reward": 4.625, + "reward_std": 2.25, + "rewards/reward_combined/mean": 4.625, + "rewards/reward_combined/std": 2.25, + "step": 9129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 169.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00015591923147439957, + "kl": 3.56137752532959e-06, + "learning_rate": 2.9033333333333335e-07, + "loss": 0.0, + "num_tokens": 2687186.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 169.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5556657314300537, + "kl": 0.06467830576002598, + "learning_rate": 2.9e-07, + "loss": 0.0041, + "num_tokens": 2687541.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 169.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021280251443386078, + "kl": 0.000614169239270268, + "learning_rate": 2.896666666666667e-07, + "loss": 0.0, + "num_tokens": 2687760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 169.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.05633282661438, + "kl": 0.06055983155965805, + "learning_rate": 2.8933333333333333e-07, + "loss": -0.0029, + "num_tokens": 2688141.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 169.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17023494839668274, + "kl": 0.010327961761504412, + "learning_rate": 2.89e-07, + "loss": 0.0007, + "num_tokens": 2688374.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 169.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022016847506165504, + "kl": 0.09566148370504379, + "learning_rate": 2.886666666666667e-07, + "loss": 0.0048, + "num_tokens": 2688746.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 169.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03291894122958183, + "kl": 0.0050873481668531895, + "learning_rate": 2.883333333333333e-07, + "loss": 0.0002, + "num_tokens": 2689073.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.182126045227051, + "kl": 0.06510461936704814, + "learning_rate": 2.8800000000000004e-07, + "loss": 0.1526, + "num_tokens": 2689365.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 169.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012140309438109398, + "kl": 0.0004731234657811001, + "learning_rate": 2.8766666666666666e-07, + "loss": 0.0, + "num_tokens": 2689677.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 169.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006200368050485849, + "kl": 0.16181518882513046, + "learning_rate": 2.8733333333333334e-07, + "loss": 0.0081, + "num_tokens": 2689986.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 169.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059274476021528244, + "kl": 0.014872962608933449, + "learning_rate": 2.87e-07, + "loss": 0.0008, + "num_tokens": 2690338.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 169.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4888216257095337, + "kl": 0.03381285443902016, + "learning_rate": 2.866666666666667e-07, + "loss": 0.0022, + "num_tokens": 2690605.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 169.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.725061893463135, + "kl": 0.14928995189256966, + "learning_rate": 2.863333333333333e-07, + "loss": 0.027, + "num_tokens": 2690882.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 9142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007302282378077507, + "kl": 0.0011429578298702836, + "learning_rate": 2.8600000000000005e-07, + "loss": 0.0001, + "num_tokens": 2691142.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 169.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054937105625867844, + "kl": 0.012293716194108129, + "learning_rate": 2.8566666666666667e-07, + "loss": 0.0006, + "num_tokens": 2691426.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 169.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007608338259160519, + "kl": 0.0014296133304014802, + "learning_rate": 2.8533333333333335e-07, + "loss": 0.0001, + "num_tokens": 2691703.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 169.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036619335412979126, + "kl": 0.00887847039848566, + "learning_rate": 2.85e-07, + "loss": 0.0005, + "num_tokens": 2692046.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 169.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048186130821704865, + "kl": 0.003134794533252716, + "learning_rate": 2.8466666666666665e-07, + "loss": 0.0002, + "num_tokens": 2692290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 169.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001961027504876256, + "kl": 0.0035253167152404785, + "learning_rate": 2.843333333333333e-07, + "loss": 0.0002, + "num_tokens": 2692526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 169.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026200201362371445, + "kl": 0.005162051471415907, + "learning_rate": 2.84e-07, + "loss": 0.0003, + "num_tokens": 2692814.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 169.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.085477314889431, + "kl": 0.042702607810497284, + "learning_rate": 2.836666666666667e-07, + "loss": 0.0021, + "num_tokens": 2693114.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 169.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06544790416955948, + "kl": 0.017514828126877546, + "learning_rate": 2.8333333333333336e-07, + "loss": 0.0009, + "num_tokens": 2693404.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 169.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026688124984502792, + "kl": 0.01073854649439454, + "learning_rate": 2.8300000000000003e-07, + "loss": 0.0005, + "num_tokens": 2693665.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 169.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.325022220611572, + "kl": 0.027866336633451283, + "learning_rate": 2.8266666666666666e-07, + "loss": 0.0377, + "num_tokens": 2693934.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 40.25, + "completions/mean_terminated_length": 40.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 169.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09739240258932114, + "kl": 0.05873563513159752, + "learning_rate": 2.8233333333333333e-07, + "loss": 0.0029, + "num_tokens": 2694311.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 169.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3076964020729065, + "kl": 0.04121387377381325, + "learning_rate": 2.82e-07, + "loss": 0.0023, + "num_tokens": 2694618.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 169.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026420464739203453, + "kl": 0.0006961002945899963, + "learning_rate": 2.8166666666666663e-07, + "loss": 0.0, + "num_tokens": 2694824.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 169.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08795703947544098, + "kl": 0.032087234780192375, + "learning_rate": 2.8133333333333336e-07, + "loss": 0.0016, + "num_tokens": 2695165.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 169.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055294718593358994, + "kl": 0.023009028751403093, + "learning_rate": 2.81e-07, + "loss": 0.0011, + "num_tokens": 2695450.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 169.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014163954183459282, + "kl": 0.26626069843769073, + "learning_rate": 2.8066666666666667e-07, + "loss": 0.0133, + "num_tokens": 2695754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 169.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011316646821796894, + "kl": 0.0006442245794460177, + "learning_rate": 2.8033333333333334e-07, + "loss": 0.0, + "num_tokens": 2696022.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 169.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.68283748626709, + "kl": 0.023975116200745106, + "learning_rate": 2.8e-07, + "loss": 0.0245, + "num_tokens": 2696354.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 169.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04598274081945419, + "kl": 0.0012741684913635254, + "learning_rate": 2.7966666666666664e-07, + "loss": 0.0001, + "num_tokens": 2696570.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 169.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08580449223518372, + "kl": 0.015314336866140366, + "learning_rate": 2.7933333333333337e-07, + "loss": 0.0008, + "num_tokens": 2696862.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 169.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7087838053703308, + "kl": 0.06382860301528126, + "learning_rate": 2.79e-07, + "loss": 0.0035, + "num_tokens": 2697149.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 169.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024151522666215897, + "kl": 0.012360613327473402, + "learning_rate": 2.786666666666667e-07, + "loss": 0.0007, + "num_tokens": 2697421.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 169.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11619424819946289, + "kl": 0.02923139650374651, + "learning_rate": 2.7833333333333335e-07, + "loss": 0.0015, + "num_tokens": 2697800.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 53.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 53.75, + "completions/mean_terminated_length": 53.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 169.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5626919269561768, + "kl": 0.07971706241369247, + "learning_rate": 2.78e-07, + "loss": 0.3251, + "num_tokens": 2698251.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053862739354372025, + "kl": 0.003571488428860903, + "learning_rate": 2.776666666666667e-07, + "loss": 0.0002, + "num_tokens": 2698522.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 169.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021490822546184063, + "kl": 0.0003683630784507841, + "learning_rate": 2.7733333333333333e-07, + "loss": 0.0, + "num_tokens": 2698758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 169.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08697015792131424, + "kl": 0.032572224736213684, + "learning_rate": 2.77e-07, + "loss": 0.0016, + "num_tokens": 2699068.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 169.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02715768851339817, + "kl": 0.0014278620365075767, + "learning_rate": 2.766666666666667e-07, + "loss": 0.0001, + "num_tokens": 2699390.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030639655888080597, + "kl": 0.0005421936511993408, + "learning_rate": 2.7633333333333336e-07, + "loss": 0.0, + "num_tokens": 2699646.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 169.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08038154244422913, + "kl": 0.005284496815875173, + "learning_rate": 2.76e-07, + "loss": 0.0002, + "num_tokens": 2699918.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 169.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0639193058013916, + "kl": 0.005531937116757035, + "learning_rate": 2.756666666666667e-07, + "loss": 0.0003, + "num_tokens": 2700216.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 169.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04740872606635094, + "kl": 0.0005561858415603638, + "learning_rate": 2.7533333333333334e-07, + "loss": 0.0, + "num_tokens": 2700430.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 169.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05378052592277527, + "kl": 0.0035414602607488632, + "learning_rate": 2.7499999999999996e-07, + "loss": 0.0002, + "num_tokens": 2700742.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 169.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07247386872768402, + "kl": 0.015047638677060604, + "learning_rate": 2.746666666666667e-07, + "loss": 0.0007, + "num_tokens": 2701066.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 169.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031231416389346123, + "kl": 0.0068037977907806635, + "learning_rate": 2.743333333333333e-07, + "loss": 0.0003, + "num_tokens": 2701322.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 169.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002868650946766138, + "kl": 7.636298687430099e-05, + "learning_rate": 2.74e-07, + "loss": 0.0, + "num_tokens": 2701592.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 170.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044867340475320816, + "kl": 0.00415095454081893, + "learning_rate": 2.7366666666666667e-07, + "loss": 0.0002, + "num_tokens": 2701892.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 170.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0876043364405632, + "kl": 0.02977957483381033, + "learning_rate": 2.7333333333333335e-07, + "loss": 0.0015, + "num_tokens": 2702233.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02200915478169918, + "kl": 0.007784502813592553, + "learning_rate": 2.73e-07, + "loss": 0.0004, + "num_tokens": 2702507.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 170.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014696286991238594, + "kl": 0.00040178000926971436, + "learning_rate": 2.726666666666667e-07, + "loss": 0.0, + "num_tokens": 2702719.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046244912664406, + "kl": 0.0012522083707153797, + "learning_rate": 2.723333333333333e-07, + "loss": 0.0001, + "num_tokens": 2702999.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04747818782925606, + "kl": 0.007726686540991068, + "learning_rate": 2.72e-07, + "loss": 0.0004, + "num_tokens": 2703292.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 170.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0423741415143013, + "kl": 0.001446351408958435, + "learning_rate": 2.716666666666667e-07, + "loss": 0.0001, + "num_tokens": 2703552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 170.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.266155989782419e-06, + "kl": 1.8998980522155762e-06, + "learning_rate": 2.713333333333333e-07, + "loss": 0.0, + "num_tokens": 2703772.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 170.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004043549299240112, + "kl": 0.00039796531200408936, + "learning_rate": 2.7100000000000003e-07, + "loss": 0.0, + "num_tokens": 2703978.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0425496101379395, + "kl": 0.06336348480544984, + "learning_rate": 2.7066666666666666e-07, + "loss": 0.0193, + "num_tokens": 2704247.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016158513724803925, + "kl": 0.0005108352343086153, + "learning_rate": 2.7033333333333333e-07, + "loss": 0.0, + "num_tokens": 2704527.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 170.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05484510585665703, + "kl": 0.060698799788951874, + "learning_rate": 2.7e-07, + "loss": 0.003, + "num_tokens": 2704898.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 170.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030827559530735016, + "kl": 0.0036003990098834038, + "learning_rate": 2.696666666666667e-07, + "loss": 0.0002, + "num_tokens": 2705200.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017668500542640686, + "kl": 0.0007510337454732507, + "learning_rate": 2.693333333333333e-07, + "loss": 0.0, + "num_tokens": 2705526.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 170.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.536442756652832, + "kl": 0.04639521427452564, + "learning_rate": 2.6900000000000004e-07, + "loss": 0.0518, + "num_tokens": 2705840.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 170.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018822981044650078, + "kl": 0.0020066049182787538, + "learning_rate": 2.6866666666666666e-07, + "loss": 0.0001, + "num_tokens": 2706110.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 170.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004678748548030853, + "kl": 0.00013982994278194383, + "learning_rate": 2.6833333333333334e-07, + "loss": 0.0, + "num_tokens": 2706332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 170.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16967931389808655, + "kl": 0.021710258326493204, + "learning_rate": 2.68e-07, + "loss": 0.0016, + "num_tokens": 2706609.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 170.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018137242645025253, + "kl": 0.0023055775091052055, + "learning_rate": 2.6766666666666664e-07, + "loss": 0.0001, + "num_tokens": 2706921.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 170.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0051785982213914394, + "kl": 0.0002482414129190147, + "learning_rate": 2.6733333333333337e-07, + "loss": 0.0, + "num_tokens": 2707181.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.37037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.575043678283691, + "kl": 0.2401425465941429, + "learning_rate": 2.67e-07, + "loss": -0.0599, + "num_tokens": 2707480.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 9200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 170.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10838541388511658, + "kl": 0.01027031010016799, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.0005, + "num_tokens": 2707750.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04622650146484375, + "kl": 0.0029889001743867993, + "learning_rate": 2.6633333333333335e-07, + "loss": 0.0001, + "num_tokens": 2708034.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 170.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.80014967918396, + "kl": 0.03519435413181782, + "learning_rate": 2.66e-07, + "loss": 0.0717, + "num_tokens": 2708363.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040308836847543716, + "kl": 0.004975801100954413, + "learning_rate": 2.6566666666666665e-07, + "loss": 0.0002, + "num_tokens": 2708647.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 170.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053678594529628754, + "kl": 0.0022880625911056995, + "learning_rate": 2.653333333333334e-07, + "loss": 0.0001, + "num_tokens": 2708881.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 170.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2773194313049316, + "kl": 0.03563913959078491, + "learning_rate": 2.65e-07, + "loss": 0.0971, + "num_tokens": 2709187.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 170.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3573484420776367, + "kl": 0.6315638422966003, + "learning_rate": 2.6466666666666663e-07, + "loss": 0.0316, + "num_tokens": 2709491.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 170.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09753599762916565, + "kl": 0.019341569393873215, + "learning_rate": 2.6433333333333336e-07, + "loss": 0.001, + "num_tokens": 2709827.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41413000226020813, + "kl": 0.030696485773660243, + "learning_rate": 2.64e-07, + "loss": 0.0016, + "num_tokens": 2710113.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 170.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010697687044739723, + "kl": 0.0006071812531445175, + "learning_rate": 2.6366666666666666e-07, + "loss": 0.0, + "num_tokens": 2710424.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 170.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09459370374679565, + "kl": 0.007220255443826318, + "learning_rate": 2.6333333333333334e-07, + "loss": 0.0004, + "num_tokens": 2710761.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 170.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.147076606750488, + "kl": 0.02942005218937993, + "learning_rate": 2.63e-07, + "loss": -0.093, + "num_tokens": 2711095.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 170.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0780385211110115, + "kl": 0.022949498146772385, + "learning_rate": 2.626666666666667e-07, + "loss": 0.0012, + "num_tokens": 2711396.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 170.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06539977341890335, + "kl": 0.007835661293938756, + "learning_rate": 2.6233333333333337e-07, + "loss": 0.0004, + "num_tokens": 2711670.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 170.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042042363435029984, + "kl": 0.0020368692348711193, + "learning_rate": 2.62e-07, + "loss": 0.0001, + "num_tokens": 2711942.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 170.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028872370719909668, + "kl": 0.0005261659680400044, + "learning_rate": 2.6166666666666667e-07, + "loss": 0.0, + "num_tokens": 2712155.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 170.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41078078746795654, + "kl": 0.06827171891927719, + "learning_rate": 2.6133333333333334e-07, + "loss": 0.0033, + "num_tokens": 2712517.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 170.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018712634919211268, + "kl": 0.0035548508167266846, + "learning_rate": 2.6099999999999997e-07, + "loss": 0.0002, + "num_tokens": 2712753.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 170.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004391403403133154, + "kl": 0.0014798715710639954, + "learning_rate": 2.606666666666667e-07, + "loss": 0.0001, + "num_tokens": 2712969.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 170.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026176869869232178, + "kl": 0.05682331882417202, + "learning_rate": 2.603333333333333e-07, + "loss": 0.0028, + "num_tokens": 2713301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 170.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027988692745566368, + "kl": 0.09581217914819717, + "learning_rate": 2.6e-07, + "loss": 0.0048, + "num_tokens": 2713673.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 170.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2307140827178955, + "kl": 0.03173012437764555, + "learning_rate": 2.596666666666667e-07, + "loss": 0.1443, + "num_tokens": 2714016.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.75, + "completions/mean_terminated_length": 36.75, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 170.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7073192596435547, + "kl": 0.10055483132600784, + "learning_rate": 2.5933333333333335e-07, + "loss": -0.0142, + "num_tokens": 2714379.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 170.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0860510841012001, + "kl": 0.005248288391157985, + "learning_rate": 2.59e-07, + "loss": 0.0003, + "num_tokens": 2714686.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 170.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032722461968660355, + "kl": 0.0030742096714675426, + "learning_rate": 2.586666666666667e-07, + "loss": 0.0002, + "num_tokens": 2714974.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9225 + }, + { + "clip_ratio/high_max": 0.01923076994717121, + "clip_ratio/high_mean": 0.01923076994717121, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01923076994717121, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 170.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.562826633453369, + "kl": 0.07377597441154649, + "learning_rate": 2.5833333333333333e-07, + "loss": 0.0402, + "num_tokens": 2715248.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 170.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004207914229482412, + "kl": 0.00027719512581825256, + "learning_rate": 2.58e-07, + "loss": 0.0, + "num_tokens": 2715492.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 170.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01793079450726509, + "kl": 0.012614931911230087, + "learning_rate": 2.576666666666667e-07, + "loss": 0.0006, + "num_tokens": 2715752.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 170.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03436558321118355, + "kl": 0.01032313471660018, + "learning_rate": 2.573333333333333e-07, + "loss": 0.0005, + "num_tokens": 2716045.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 170.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001848552841693163, + "kl": 5.713402970286552e-05, + "learning_rate": 2.5700000000000004e-07, + "loss": 0.0, + "num_tokens": 2716301.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 170.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008658486418426037, + "kl": 0.15746158361434937, + "learning_rate": 2.5666666666666666e-07, + "loss": 0.0079, + "num_tokens": 2716612.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 170.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24864767491817474, + "kl": 0.02767011895775795, + "learning_rate": 2.5633333333333334e-07, + "loss": 0.0014, + "num_tokens": 2716873.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 32.75, + "completions/mean_terminated_length": 32.75, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 170.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.20802640914917, + "kl": 0.0335781816393137, + "learning_rate": 2.56e-07, + "loss": 0.045, + "num_tokens": 2717284.0, + "reward": 2.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.25, + "rewards/reward_combined/std": 1.5, + "step": 9233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 171.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06469318270683289, + "kl": 0.005688388191629201, + "learning_rate": 2.556666666666667e-07, + "loss": 0.0003, + "num_tokens": 2717582.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 171.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017351731657981873, + "kl": 0.002413789741694927, + "learning_rate": 2.553333333333333e-07, + "loss": 0.0001, + "num_tokens": 2717894.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 171.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013852477073669434, + "kl": 0.26613885164260864, + "learning_rate": 2.5500000000000005e-07, + "loss": 0.0133, + "num_tokens": 2718198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06120152398943901, + "kl": 0.015362829202786088, + "learning_rate": 2.5466666666666667e-07, + "loss": 0.0008, + "num_tokens": 2718484.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 171.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03666425496339798, + "kl": 0.005595547321718186, + "learning_rate": 2.543333333333333e-07, + "loss": 0.0003, + "num_tokens": 2718780.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 171.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029687346890568733, + "kl": 0.0023615637328475714, + "learning_rate": 2.54e-07, + "loss": 0.0001, + "num_tokens": 2719062.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 171.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018211375921964645, + "kl": 0.012510097585618496, + "learning_rate": 2.5366666666666665e-07, + "loss": 0.0006, + "num_tokens": 2719322.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.25, + "completions/mean_terminated_length": 7.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 171.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10099165141582489, + "kl": 0.005628938903100789, + "learning_rate": 2.533333333333333e-07, + "loss": 0.0003, + "num_tokens": 2719559.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 171.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06273996084928513, + "kl": 0.061120785772800446, + "learning_rate": 2.53e-07, + "loss": 0.0031, + "num_tokens": 2719930.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 171.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018921660259366035, + "kl": 0.011749022640287876, + "learning_rate": 2.526666666666667e-07, + "loss": 0.0007, + "num_tokens": 2720204.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 171.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019094351679086685, + "kl": 0.0008994002782856114, + "learning_rate": 2.5233333333333336e-07, + "loss": 0.0, + "num_tokens": 2720513.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1848917156457901, + "kl": 0.1712716445326805, + "learning_rate": 2.5200000000000003e-07, + "loss": 0.0086, + "num_tokens": 2720828.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 171.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03211478516459465, + "kl": 0.005016311421059072, + "learning_rate": 2.5166666666666666e-07, + "loss": 0.0003, + "num_tokens": 2721118.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 171.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3765474259853363, + "kl": 0.0724665205925703, + "learning_rate": 2.5133333333333333e-07, + "loss": 0.0031, + "num_tokens": 2721463.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 171.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638824984431267, + "kl": 0.0047746936907060444, + "learning_rate": 2.51e-07, + "loss": 0.0002, + "num_tokens": 2721790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 171.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.850092887878418, + "kl": 0.05242218263447285, + "learning_rate": 2.5066666666666663e-07, + "loss": 0.0393, + "num_tokens": 2722199.0, + "reward": 2.799999952316284, + "reward_std": 0.4000000059604645, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 0.4000000059604645, + "step": 9249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 171.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025840256363153458, + "kl": 0.09576661139726639, + "learning_rate": 2.5033333333333336e-07, + "loss": 0.0048, + "num_tokens": 2722571.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01032637245953083, + "kl": 0.0002431079774396494, + "learning_rate": 2.5e-07, + "loss": 0.0, + "num_tokens": 2722835.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06110481172800064, + "kl": 0.009592322399839759, + "learning_rate": 2.4966666666666667e-07, + "loss": 0.0005, + "num_tokens": 2723128.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01683252677321434, + "kl": 0.000428263854701072, + "learning_rate": 2.4933333333333334e-07, + "loss": 0.0, + "num_tokens": 2723398.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.25, + "completions/mean_terminated_length": 4.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 171.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05815865099430084, + "kl": 0.0010071337219415, + "learning_rate": 2.49e-07, + "loss": 0.0001, + "num_tokens": 2723611.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 171.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008881403133273125, + "kl": 0.0002932734787464142, + "learning_rate": 2.4866666666666664e-07, + "loss": 0.0, + "num_tokens": 2723855.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034985411912202835, + "kl": 0.011005035368725657, + "learning_rate": 2.4833333333333337e-07, + "loss": 0.0006, + "num_tokens": 2724178.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012652016244828701, + "kl": 0.0001588374379934976, + "learning_rate": 2.48e-07, + "loss": 0.0, + "num_tokens": 2724434.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 171.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06394850462675095, + "kl": 0.0523146316409111, + "learning_rate": 2.476666666666667e-07, + "loss": 0.0026, + "num_tokens": 2724772.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.047183990478516, + "kl": 0.006762051023542881, + "learning_rate": 2.4733333333333335e-07, + "loss": 0.0933, + "num_tokens": 2725051.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 171.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6576875448226929, + "kl": 0.06636911258101463, + "learning_rate": 2.47e-07, + "loss": 0.0035, + "num_tokens": 2725360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 171.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027701316867023706, + "kl": 0.00012449621863197535, + "learning_rate": 2.466666666666667e-07, + "loss": 0.0, + "num_tokens": 2725580.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 171.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03198763728141785, + "kl": 0.005203839216846973, + "learning_rate": 2.4633333333333333e-07, + "loss": 0.0003, + "num_tokens": 2725905.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 171.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04723777621984482, + "kl": 0.0014811183791607618, + "learning_rate": 2.46e-07, + "loss": 0.0001, + "num_tokens": 2726117.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 171.55555555555554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7707160711288452, + "kl": 0.14118522591888905, + "learning_rate": 2.456666666666667e-07, + "loss": -0.0743, + "num_tokens": 2726479.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09439795464277267, + "kl": 0.030508296564221382, + "learning_rate": 2.4533333333333336e-07, + "loss": 0.0015, + "num_tokens": 2726811.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.25, + "completions/mean_terminated_length": 5.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 171.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12211757898330688, + "kl": 0.007215000689029694, + "learning_rate": 2.45e-07, + "loss": 0.0004, + "num_tokens": 2727032.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.447613716125488, + "kl": 0.06826008018106222, + "learning_rate": 2.446666666666667e-07, + "loss": -0.026, + "num_tokens": 2727322.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 171.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46034368872642517, + "kl": 0.06429689936339855, + "learning_rate": 2.4433333333333334e-07, + "loss": 0.0038, + "num_tokens": 2727612.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 171.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03711140900850296, + "kl": 0.0030567603826057166, + "learning_rate": 2.4399999999999996e-07, + "loss": 0.0002, + "num_tokens": 2727915.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 171.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06139790639281273, + "kl": 0.005101605493109673, + "learning_rate": 2.436666666666667e-07, + "loss": 0.0003, + "num_tokens": 2728179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 171.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004615632351487875, + "kl": 0.0003375775704625994, + "learning_rate": 2.433333333333333e-07, + "loss": 0.0, + "num_tokens": 2728496.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.799159526824951, + "kl": 0.17506680742371827, + "learning_rate": 2.43e-07, + "loss": 0.2766, + "num_tokens": 2728791.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 171.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.255412757396698, + "kl": 0.021144443744560704, + "learning_rate": 2.4266666666666667e-07, + "loss": 0.0014, + "num_tokens": 2729079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 171.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36697056889533997, + "kl": 0.024108875542879105, + "learning_rate": 2.4233333333333335e-07, + "loss": 0.0016, + "num_tokens": 2729346.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 171.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.15786075592041, + "kl": 0.6303621083498001, + "learning_rate": 2.42e-07, + "loss": 0.068, + "num_tokens": 2729711.0, + "reward": 4.0, + "reward_std": 2.915475845336914, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 2.915475845336914, + "step": 9275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029184000566601753, + "kl": 0.005045531550422311, + "learning_rate": 2.416666666666667e-07, + "loss": 0.0003, + "num_tokens": 2729993.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02143452689051628, + "kl": 0.007529604714363813, + "learning_rate": 2.413333333333333e-07, + "loss": 0.0004, + "num_tokens": 2730287.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12209755182266235, + "kl": 0.007590380730107427, + "learning_rate": 2.41e-07, + "loss": 0.0005, + "num_tokens": 2730574.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 171.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07218904048204422, + "kl": 0.016672561643645167, + "learning_rate": 2.406666666666667e-07, + "loss": 0.001, + "num_tokens": 2730856.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 171.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09310668706893921, + "kl": 0.009345972328446805, + "learning_rate": 2.403333333333333e-07, + "loss": 0.0005, + "num_tokens": 2731133.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34855785965919495, + "kl": 0.08765563741326332, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.0047, + "num_tokens": 2731447.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 171.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025222888216376305, + "kl": 0.006499540293589234, + "learning_rate": 2.3966666666666666e-07, + "loss": 0.0003, + "num_tokens": 2731788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 171.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017833812162280083, + "kl": 0.0035702288150787354, + "learning_rate": 2.3933333333333333e-07, + "loss": 0.0002, + "num_tokens": 2732024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 171.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017632756382226944, + "kl": 0.0005254000425338745, + "learning_rate": 2.39e-07, + "loss": 0.0, + "num_tokens": 2732236.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 171.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 16.372608184814453, + "kl": 0.10826432891190052, + "learning_rate": 2.386666666666667e-07, + "loss": 0.0565, + "num_tokens": 2732541.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 171.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.463217480108142e-05, + "kl": 2.205371856689453e-06, + "learning_rate": 2.3833333333333334e-07, + "loss": 0.0, + "num_tokens": 2732761.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 171.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062440160661935806, + "kl": 0.010460796765983105, + "learning_rate": 2.3800000000000001e-07, + "loss": 0.0006, + "num_tokens": 2733091.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011701135896146297, + "kl": 0.0007465392409358174, + "learning_rate": 2.3766666666666666e-07, + "loss": 0.0, + "num_tokens": 2733351.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 172.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03472888097167015, + "kl": 0.005237925099208951, + "learning_rate": 2.3733333333333331e-07, + "loss": 0.0003, + "num_tokens": 2733647.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19062164425849915, + "kl": 0.021663925144821405, + "learning_rate": 2.3700000000000002e-07, + "loss": 0.001, + "num_tokens": 2733943.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02929820492863655, + "kl": 0.022063929587602615, + "learning_rate": 2.3666666666666667e-07, + "loss": 0.0011, + "num_tokens": 2734218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 172.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011094835645053536, + "kl": 2.3543834686279297e-06, + "learning_rate": 2.3633333333333335e-07, + "loss": 0.0, + "num_tokens": 2734438.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 172.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419840291142464, + "kl": 0.01258205994963646, + "learning_rate": 2.36e-07, + "loss": 0.0006, + "num_tokens": 2734773.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 172.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017215872183442116, + "kl": 0.0005730873963329941, + "learning_rate": 2.3566666666666667e-07, + "loss": 0.0, + "num_tokens": 2735079.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2060709148645401, + "kl": 0.018995384220033884, + "learning_rate": 2.3533333333333332e-07, + "loss": 0.001, + "num_tokens": 2735397.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03913000226020813, + "kl": 0.01191510446369648, + "learning_rate": 2.3500000000000003e-07, + "loss": 0.0006, + "num_tokens": 2735693.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 172.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09895136207342148, + "kl": 0.03540264815092087, + "learning_rate": 2.3466666666666668e-07, + "loss": 0.0018, + "num_tokens": 2736017.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 172.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10670503973960876, + "kl": 0.007496505975723267, + "learning_rate": 2.3433333333333335e-07, + "loss": 0.0004, + "num_tokens": 2736278.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 172.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01744152419269085, + "kl": 0.012766205705702305, + "learning_rate": 2.34e-07, + "loss": 0.0006, + "num_tokens": 2736538.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 172.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1122836172580719, + "kl": 0.013083487749099731, + "learning_rate": 2.3366666666666665e-07, + "loss": 0.0007, + "num_tokens": 2736846.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 172.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3355092704296112, + "kl": 0.03203204367309809, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.0018, + "num_tokens": 2737164.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05035262182354927, + "kl": 0.0025760321877896786, + "learning_rate": 2.3299999999999998e-07, + "loss": 0.0001, + "num_tokens": 2737420.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 8.75, + "completions/mean_terminated_length": 8.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 172.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08612331002950668, + "kl": 0.0055491626262664795, + "learning_rate": 2.3266666666666669e-07, + "loss": 0.0004, + "num_tokens": 2737655.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 172.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012322024442255497, + "kl": 0.15765418857336044, + "learning_rate": 2.3233333333333334e-07, + "loss": 0.0079, + "num_tokens": 2737966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07245767116546631, + "kl": 0.00193899535224773, + "learning_rate": 2.32e-07, + "loss": 0.0001, + "num_tokens": 2738238.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024333052337169647, + "kl": 0.008449589367955923, + "learning_rate": 2.3166666666666666e-07, + "loss": 0.0004, + "num_tokens": 2738512.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 172.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030806667637079954, + "kl": 4.373490810394287e-05, + "learning_rate": 2.3133333333333337e-07, + "loss": 0.0, + "num_tokens": 2738724.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 172.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12267035990953445, + "kl": 0.019190243910998106, + "learning_rate": 2.31e-07, + "loss": 0.001, + "num_tokens": 2739057.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021785197779536247, + "kl": 0.001960778550710529, + "learning_rate": 2.306666666666667e-07, + "loss": 0.0001, + "num_tokens": 2739329.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 172.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015194157138466835, + "kl": 0.00043241679668426514, + "learning_rate": 2.3033333333333334e-07, + "loss": 0.0, + "num_tokens": 2739541.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007095846347510815, + "kl": 0.00037607570993714035, + "learning_rate": 2.3e-07, + "loss": 0.0, + "num_tokens": 2739861.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 172.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021073205396533012, + "kl": 0.0006072536125429906, + "learning_rate": 2.2966666666666667e-07, + "loss": 0.0, + "num_tokens": 2740080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03315271809697151, + "kl": 0.007109067548299208, + "learning_rate": 2.2933333333333332e-07, + "loss": 0.0004, + "num_tokens": 2740350.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 30.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 172.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08940380811691284, + "kl": 0.05822291411459446, + "learning_rate": 2.2900000000000003e-07, + "loss": 0.0029, + "num_tokens": 2740753.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008731303736567497, + "kl": 0.0004929818242089823, + "learning_rate": 2.2866666666666665e-07, + "loss": 0.0, + "num_tokens": 2741015.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06337254494428635, + "kl": 0.0038957372307777405, + "learning_rate": 2.2833333333333335e-07, + "loss": 0.0002, + "num_tokens": 2741283.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 172.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018643449991941452, + "kl": 0.0019971057190559804, + "learning_rate": 2.28e-07, + "loss": 0.0001, + "num_tokens": 2741553.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 172.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012605863623321056, + "kl": 0.000526178628206253, + "learning_rate": 2.2766666666666668e-07, + "loss": 0.0, + "num_tokens": 2741813.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 172.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5165688991546631, + "kl": 0.041914600413292646, + "learning_rate": 2.2733333333333333e-07, + "loss": 0.0023, + "num_tokens": 2742153.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13938301801681519, + "kl": 0.011621064433711581, + "learning_rate": 2.2699999999999998e-07, + "loss": 0.0006, + "num_tokens": 2742437.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 172.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.613528251647949, + "kl": 0.06211712956428528, + "learning_rate": 2.2666666666666668e-07, + "loss": -0.0025, + "num_tokens": 2742814.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 172.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.276278495788574, + "kl": 0.05184578709304333, + "learning_rate": 2.2633333333333334e-07, + "loss": -0.0061, + "num_tokens": 2743141.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 172.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03256721794605255, + "kl": 0.05650538019835949, + "learning_rate": 2.26e-07, + "loss": 0.0028, + "num_tokens": 2743507.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000321406900184229, + "kl": 0.001245351741090417, + "learning_rate": 2.2566666666666666e-07, + "loss": 0.0001, + "num_tokens": 2743787.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 172.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004895047750324011, + "kl": 0.00033820047974586487, + "learning_rate": 2.2533333333333334e-07, + "loss": 0.0, + "num_tokens": 2744031.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 172.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013029099442064762, + "kl": 0.002477428744896315, + "learning_rate": 2.25e-07, + "loss": 0.0001, + "num_tokens": 2744319.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 172.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21533021330833435, + "kl": 0.03718895465135574, + "learning_rate": 2.246666666666667e-07, + "loss": 0.0019, + "num_tokens": 2744634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 172.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07662699371576309, + "kl": 0.014657713938504457, + "learning_rate": 2.2433333333333334e-07, + "loss": 0.0008, + "num_tokens": 2744908.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 75.0, + "completions/mean_terminated_length": 14.666666984558105, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.632269859313965, + "kl": 0.1661722231656313, + "learning_rate": 2.2400000000000002e-07, + "loss": 0.4594, + "num_tokens": 2745440.0, + "reward": 5.550000190734863, + "reward_std": 3.9000003337860107, + "rewards/reward_combined/mean": 5.550000190734863, + "rewards/reward_combined/std": 3.9000000953674316, + "step": 9329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 49.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 49.75, + "completions/mean_terminated_length": 49.75, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 172.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05726558715105057, + "kl": 0.010915250750258565, + "learning_rate": 2.2366666666666667e-07, + "loss": 0.0006, + "num_tokens": 2745859.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 172.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6450158357620239, + "kl": 0.5728609263896942, + "learning_rate": 2.2333333333333332e-07, + "loss": 0.0422, + "num_tokens": 2746164.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 172.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2203519493341446, + "kl": 0.02634395915083587, + "learning_rate": 2.23e-07, + "loss": 0.0012, + "num_tokens": 2746397.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 172.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03175600990653038, + "kl": 0.010970419738441706, + "learning_rate": 2.2266666666666665e-07, + "loss": 0.0006, + "num_tokens": 2746689.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 172.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5622710585594177, + "kl": 0.17441180627793074, + "learning_rate": 2.2233333333333335e-07, + "loss": 0.0075, + "num_tokens": 2747057.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 172.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5868734121322632, + "kl": 0.3006749153137207, + "learning_rate": 2.22e-07, + "loss": -0.004, + "num_tokens": 2747427.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 9335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 172.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017449725419282913, + "kl": 0.0035766512155532837, + "learning_rate": 2.2166666666666668e-07, + "loss": 0.0002, + "num_tokens": 2747663.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 172.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01739422045648098, + "kl": 0.0001802891492843628, + "learning_rate": 2.2133333333333333e-07, + "loss": 0.0, + "num_tokens": 2747871.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 28.5, + "completions/mean_terminated_length": 28.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 172.92592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.944025754928589, + "kl": 0.06725234352052212, + "learning_rate": 2.2100000000000003e-07, + "loss": 0.1822, + "num_tokens": 2748213.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 9338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 172.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008182493038475513, + "kl": 0.0011069655301980674, + "learning_rate": 2.2066666666666666e-07, + "loss": 0.0001, + "num_tokens": 2748473.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 172.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.060744285583496, + "kl": 0.2748560756444931, + "learning_rate": 2.2033333333333336e-07, + "loss": 0.015, + "num_tokens": 2748821.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 172.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06978320330381393, + "kl": 0.011245114263147116, + "learning_rate": 2.2e-07, + "loss": 0.0006, + "num_tokens": 2749103.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.015462875366211, + "kl": 0.05356736574321985, + "learning_rate": 2.1966666666666666e-07, + "loss": -0.0453, + "num_tokens": 2749388.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 9342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 173.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02747127041220665, + "kl": 0.005177264800295234, + "learning_rate": 2.1933333333333334e-07, + "loss": 0.0002, + "num_tokens": 2749704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02128387801349163, + "kl": 0.02021583146415651, + "learning_rate": 2.19e-07, + "loss": 0.001, + "num_tokens": 2749979.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 173.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002335039433091879, + "kl": 0.0014481768012046814, + "learning_rate": 2.186666666666667e-07, + "loss": 0.0001, + "num_tokens": 2750291.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 173.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019124042242765427, + "kl": 0.0006656706391368061, + "learning_rate": 2.1833333333333332e-07, + "loss": 0.0, + "num_tokens": 2750597.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 173.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005986900068819523, + "kl": 0.0034085522347595543, + "learning_rate": 2.1800000000000002e-07, + "loss": 0.0002, + "num_tokens": 2750855.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 173.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015862395986914635, + "kl": 0.003157562459819019, + "learning_rate": 2.1766666666666667e-07, + "loss": 0.0002, + "num_tokens": 2751143.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 173.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016466058790683746, + "kl": 0.04704936593770981, + "learning_rate": 2.1733333333333335e-07, + "loss": 0.0024, + "num_tokens": 2751547.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 173.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012452344410121441, + "kl": 0.0005145557224750519, + "learning_rate": 2.17e-07, + "loss": 0.0, + "num_tokens": 2751807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 173.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020508766174316406, + "kl": 0.01153493532910943, + "learning_rate": 2.1666666666666665e-07, + "loss": 0.0004, + "num_tokens": 2752161.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 173.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013954057358205318, + "kl": 0.0005302221106830984, + "learning_rate": 2.1633333333333335e-07, + "loss": 0.0, + "num_tokens": 2752480.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.058448314666748, + "kl": 0.13925595860928297, + "learning_rate": 2.16e-07, + "loss": -0.0518, + "num_tokens": 2752744.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 173.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03587964549660683, + "kl": 0.006062433822080493, + "learning_rate": 2.1566666666666668e-07, + "loss": 0.0003, + "num_tokens": 2753026.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 173.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6479862928390503, + "kl": 0.17314231861382723, + "learning_rate": 2.1533333333333333e-07, + "loss": 0.0084, + "num_tokens": 2753316.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 173.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027127057313919067, + "kl": 0.05456158146262169, + "learning_rate": 2.15e-07, + "loss": 0.0027, + "num_tokens": 2753648.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 173.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018730001524090767, + "kl": 0.000610843300819397, + "learning_rate": 2.1466666666666666e-07, + "loss": 0.0, + "num_tokens": 2753860.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 173.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.952392580686137e-05, + "kl": 2.034008502960205e-06, + "learning_rate": 2.1433333333333336e-07, + "loss": 0.0, + "num_tokens": 2754080.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007977486588060856, + "kl": 0.0005839644290972501, + "learning_rate": 2.14e-07, + "loss": 0.0, + "num_tokens": 2754362.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 173.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06698492169380188, + "kl": 0.017664545215666294, + "learning_rate": 2.136666666666667e-07, + "loss": 0.0009, + "num_tokens": 2754687.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019937697798013687, + "kl": 0.00027641008637147024, + "learning_rate": 2.1333333333333334e-07, + "loss": 0.0, + "num_tokens": 2754953.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.25, + "completions/mean_terminated_length": 6.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 173.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045142341405153275, + "kl": 0.002216329798102379, + "learning_rate": 2.13e-07, + "loss": 0.0001, + "num_tokens": 2755186.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 173.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015254939906299114, + "kl": 0.2658967822790146, + "learning_rate": 2.1266666666666667e-07, + "loss": 0.0133, + "num_tokens": 2755490.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 173.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.326937675476074, + "kl": 0.052946810610592365, + "learning_rate": 2.1233333333333332e-07, + "loss": -0.028, + "num_tokens": 2755802.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 9364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 3.75, + "completions/mean_terminated_length": 3.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 173.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015295280143618584, + "kl": 0.0021119669545441866, + "learning_rate": 2.1200000000000002e-07, + "loss": 0.0001, + "num_tokens": 2756013.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 173.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015741040697321296, + "kl": 8.230805542552844e-05, + "learning_rate": 2.1166666666666667e-07, + "loss": 0.0, + "num_tokens": 2756233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 173.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06418458372354507, + "kl": 0.009016437456011772, + "learning_rate": 2.1133333333333335e-07, + "loss": 0.0005, + "num_tokens": 2756577.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 173.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03234009072184563, + "kl": 0.002503427502233535, + "learning_rate": 2.11e-07, + "loss": 0.0001, + "num_tokens": 2756879.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 173.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0013644895516335964, + "kl": 3.719329833984375e-05, + "learning_rate": 2.106666666666667e-07, + "loss": 0.0, + "num_tokens": 2757091.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.042857646942139, + "kl": 0.11064870469272137, + "learning_rate": 2.1033333333333332e-07, + "loss": 0.1282, + "num_tokens": 2757370.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.162346363067627, + "kl": 0.03916444256901741, + "learning_rate": 2.1000000000000003e-07, + "loss": 0.0437, + "num_tokens": 2757710.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 173.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087526835501194, + "kl": 0.004464875208213925, + "learning_rate": 2.0966666666666668e-07, + "loss": 0.0003, + "num_tokens": 2757937.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 173.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06550052762031555, + "kl": 0.04014564026147127, + "learning_rate": 2.0933333333333333e-07, + "loss": 0.0021, + "num_tokens": 2758228.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 173.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07102903723716736, + "kl": 0.02995441108942032, + "learning_rate": 2.09e-07, + "loss": 0.0015, + "num_tokens": 2758553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 173.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.05695915222168, + "kl": 0.04724416509270668, + "learning_rate": 2.0866666666666666e-07, + "loss": 0.0338, + "num_tokens": 2758798.0, + "reward": 3.125, + "reward_std": 1.75, + "rewards/reward_combined/mean": 3.125, + "rewards/reward_combined/std": 1.75, + "step": 9375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008338548243045807, + "kl": 0.0011786073446273804, + "learning_rate": 2.0833333333333336e-07, + "loss": 0.0001, + "num_tokens": 2759058.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42862752079963684, + "kl": 0.5528724770992994, + "learning_rate": 2.0799999999999998e-07, + "loss": -0.0377, + "num_tokens": 2759344.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.009803921915590763, + "clip_ratio/low_min": 0.009803921915590763, + "clip_ratio/region_mean": 0.009803921915590763, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 173.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.839827537536621, + "kl": 0.10357399843633175, + "learning_rate": 2.0766666666666669e-07, + "loss": 0.1386, + "num_tokens": 2759671.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 173.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03441961854696274, + "kl": 0.1616528481245041, + "learning_rate": 2.0733333333333334e-07, + "loss": 0.0081, + "num_tokens": 2759981.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 173.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018648155964910984, + "kl": 0.0035570859909057617, + "learning_rate": 2.0700000000000001e-07, + "loss": 0.0002, + "num_tokens": 2760217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 173.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04937727749347687, + "kl": 0.00658849161118269, + "learning_rate": 2.0666666666666666e-07, + "loss": 0.0003, + "num_tokens": 2760508.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 173.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017838289961218834, + "kl": 0.012574596330523491, + "learning_rate": 2.0633333333333331e-07, + "loss": 0.0006, + "num_tokens": 2760768.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.75, + "completions/mean_terminated_length": 27.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 173.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8720903396606445, + "kl": 0.31305863335728645, + "learning_rate": 2.0600000000000002e-07, + "loss": -0.0119, + "num_tokens": 2761107.0, + "reward": 4.0, + "reward_std": 4.690415859222412, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 4.690415859222412, + "step": 9383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031148681417107582, + "kl": 0.0011567730689421296, + "learning_rate": 2.0566666666666667e-07, + "loss": 0.0001, + "num_tokens": 2761403.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6330201625823975, + "kl": 0.318508045602357, + "learning_rate": 2.0533333333333335e-07, + "loss": -0.0655, + "num_tokens": 2761675.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 173.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09389530122280121, + "kl": 0.0770426094532013, + "learning_rate": 2.05e-07, + "loss": 0.0039, + "num_tokens": 2762050.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 173.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0350518226623535, + "kl": 0.09583992511034012, + "learning_rate": 2.0466666666666667e-07, + "loss": -0.0099, + "num_tokens": 2762411.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 9387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 173.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11118227988481522, + "kl": 0.01862932974472642, + "learning_rate": 2.0433333333333332e-07, + "loss": 0.0009, + "num_tokens": 2762686.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 173.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00037408937350846827, + "kl": 0.0012370496988296509, + "learning_rate": 2.0400000000000003e-07, + "loss": 0.0001, + "num_tokens": 2762966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0014750309055671096, + "kl": 6.294846571108792e-05, + "learning_rate": 2.0366666666666668e-07, + "loss": 0.0, + "num_tokens": 2763222.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 24.5, + "completions/mean_terminated_length": 24.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 173.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09263962507247925, + "kl": 0.01663209032267332, + "learning_rate": 2.0333333333333335e-07, + "loss": 0.0009, + "num_tokens": 2763564.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 173.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029124464839696884, + "kl": 0.008848333265632391, + "learning_rate": 2.03e-07, + "loss": 0.0004, + "num_tokens": 2763838.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9392 + }, + { + "clip_ratio/high_max": 0.006666666828095913, + "clip_ratio/high_mean": 0.006666666828095913, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006666666828095913, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 173.94444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.122303009033203, + "kl": 0.14825211837887764, + "learning_rate": 2.0266666666666666e-07, + "loss": -0.0785, + "num_tokens": 2764206.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 9393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 37.0, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 173.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03227252885699272, + "kl": 0.039860278367996216, + "learning_rate": 2.0233333333333333e-07, + "loss": 0.002, + "num_tokens": 2764578.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 173.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.471002101898193, + "kl": 0.032280536368489265, + "learning_rate": 2.0199999999999998e-07, + "loss": 0.3484, + "num_tokens": 2764878.0, + "reward": 5.875, + "reward_std": 2.136000871658325, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 2.136000871658325, + "step": 9395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 174.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07729319483041763, + "kl": 0.002565067959949374, + "learning_rate": 2.0166666666666669e-07, + "loss": 0.0001, + "num_tokens": 2765147.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 174.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03395868465304375, + "kl": 0.00591424060985446, + "learning_rate": 2.0133333333333334e-07, + "loss": 0.0003, + "num_tokens": 2765478.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 174.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013296353630721569, + "kl": 0.000528055927134119, + "learning_rate": 2.01e-07, + "loss": 0.0, + "num_tokens": 2765798.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 174.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008081411942839622, + "kl": 0.001099076820537448, + "learning_rate": 2.0066666666666666e-07, + "loss": 0.0001, + "num_tokens": 2766058.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 174.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06403493881225586, + "kl": 0.0036330987350083888, + "learning_rate": 2.0033333333333337e-07, + "loss": 0.0002, + "num_tokens": 2766356.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 174.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05856318026781082, + "kl": 0.013797045103274286, + "learning_rate": 2e-07, + "loss": 0.0007, + "num_tokens": 2766642.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 174.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03477952256798744, + "kl": 0.008628587384009734, + "learning_rate": 1.996666666666667e-07, + "loss": 0.0005, + "num_tokens": 2766929.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 174.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02780861221253872, + "kl": 0.0044317287392914295, + "learning_rate": 1.9933333333333334e-07, + "loss": 0.0002, + "num_tokens": 2767211.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 174.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03354954719543457, + "kl": 0.003975623272708617, + "learning_rate": 1.99e-07, + "loss": 0.0002, + "num_tokens": 2767471.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 174.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041201986372470856, + "kl": 0.05074997805058956, + "learning_rate": 1.9866666666666667e-07, + "loss": 0.0026, + "num_tokens": 2767839.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 174.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028058665338903666, + "kl": 0.00010259449481964111, + "learning_rate": 1.9833333333333332e-07, + "loss": 0.0, + "num_tokens": 2768051.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 174.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027216970920562744, + "kl": 0.005215051583945751, + "learning_rate": 1.9800000000000003e-07, + "loss": 0.0003, + "num_tokens": 2768319.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 174.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15924301743507385, + "kl": 0.026444242801517248, + "learning_rate": 1.9766666666666665e-07, + "loss": 0.0012, + "num_tokens": 2768626.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 174.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04549303650856018, + "kl": 0.030992218293249607, + "learning_rate": 1.9733333333333335e-07, + "loss": 0.0015, + "num_tokens": 2768969.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 174.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20801319181919098, + "kl": 0.10140269249677658, + "learning_rate": 1.97e-07, + "loss": 0.0048, + "num_tokens": 2769333.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 174.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017625324428081512, + "kl": 0.000512242317199707, + "learning_rate": 1.9666666666666668e-07, + "loss": 0.0, + "num_tokens": 2769545.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 174.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06326346099376678, + "kl": 0.004295312101021409, + "learning_rate": 1.9633333333333333e-07, + "loss": 0.0002, + "num_tokens": 2769814.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 174.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01450822688639164, + "kl": 0.26604562997817993, + "learning_rate": 1.9599999999999998e-07, + "loss": 0.0133, + "num_tokens": 2770118.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 174.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02843433804810047, + "kl": 0.002526771044358611, + "learning_rate": 1.9566666666666668e-07, + "loss": 0.0001, + "num_tokens": 2770422.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 174.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02168596349656582, + "kl": 0.0026514212950132787, + "learning_rate": 1.953333333333333e-07, + "loss": 0.0001, + "num_tokens": 2770710.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 174.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001842870144173503, + "kl": 0.0035613924264907837, + "learning_rate": 1.95e-07, + "loss": 0.0002, + "num_tokens": 2770946.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 174.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1765530109405518, + "kl": 0.03562296088784933, + "learning_rate": 1.9466666666666666e-07, + "loss": 0.1266, + "num_tokens": 2771261.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 27.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 174.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11406844854354858, + "kl": 0.03590277023613453, + "learning_rate": 1.9433333333333334e-07, + "loss": 0.0017, + "num_tokens": 2771591.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 174.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003839787095785141, + "kl": 0.0001465529203414917, + "learning_rate": 1.94e-07, + "loss": 0.0, + "num_tokens": 2771799.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 174.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02580348588526249, + "kl": 0.0061992957489565015, + "learning_rate": 1.936666666666667e-07, + "loss": 0.0003, + "num_tokens": 2772136.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 174.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002588248113170266, + "kl": 0.00010200142423855141, + "learning_rate": 1.9333333333333334e-07, + "loss": 0.0, + "num_tokens": 2772356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 174.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020115790888667107, + "kl": 0.0062670658044226, + "learning_rate": 1.9300000000000002e-07, + "loss": 0.0003, + "num_tokens": 2772628.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 174.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.270094156265259, + "kl": 0.16399965435266495, + "learning_rate": 1.9266666666666667e-07, + "loss": 0.0078, + "num_tokens": 2772996.0, + "reward": 5.75, + "reward_std": 4.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 4.5, + "step": 9423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 174.5185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.376574754714966, + "kl": 0.07148301228880882, + "learning_rate": 1.9233333333333332e-07, + "loss": 0.0804, + "num_tokens": 2773340.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 174.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009622232057154179, + "kl": 0.0006956764264032245, + "learning_rate": 1.92e-07, + "loss": 0.0, + "num_tokens": 2773654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 174.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022656532004475594, + "kl": 0.001196057244669646, + "learning_rate": 1.9166666666666665e-07, + "loss": 0.0001, + "num_tokens": 2773932.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 174.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04341169074177742, + "kl": 0.01216940488666296, + "learning_rate": 1.9133333333333335e-07, + "loss": 0.0006, + "num_tokens": 2774193.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 174.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06408816576004028, + "kl": 0.028108091093599796, + "learning_rate": 1.91e-07, + "loss": 0.0014, + "num_tokens": 2774465.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 174.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04262426495552063, + "kl": 0.004996336298063397, + "learning_rate": 1.9066666666666668e-07, + "loss": 0.0002, + "num_tokens": 2774733.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 174.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037320878356695175, + "kl": 0.0019663242273963988, + "learning_rate": 1.9033333333333333e-07, + "loss": 0.0001, + "num_tokens": 2774976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 174.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000432979897595942, + "kl": 0.0013146064011380076, + "learning_rate": 1.9000000000000003e-07, + "loss": 0.0001, + "num_tokens": 2775253.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 174.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.307594933081418e-05, + "kl": 2.250075340270996e-06, + "learning_rate": 1.8966666666666666e-07, + "loss": 0.0, + "num_tokens": 2775473.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 174.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02457534521818161, + "kl": 0.05075225606560707, + "learning_rate": 1.8933333333333336e-07, + "loss": 0.0025, + "num_tokens": 2775809.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 174.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04618788883090019, + "kl": 0.007547435350716114, + "learning_rate": 1.89e-07, + "loss": 0.0004, + "num_tokens": 2776101.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 174.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04452917352318764, + "kl": 0.011068075662478805, + "learning_rate": 1.8866666666666666e-07, + "loss": 0.0006, + "num_tokens": 2776438.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 174.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7525200843811035, + "kl": 0.1560510378330946, + "learning_rate": 1.8833333333333334e-07, + "loss": 0.0045, + "num_tokens": 2776794.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 174.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6089701652526855, + "kl": 0.06847019167616963, + "learning_rate": 1.88e-07, + "loss": 0.0041, + "num_tokens": 2777089.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 9437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 174.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.260293960571289, + "kl": 0.13297653547488153, + "learning_rate": 1.876666666666667e-07, + "loss": 0.0963, + "num_tokens": 2777393.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 174.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0437699556350708, + "kl": 0.001357494038529694, + "learning_rate": 1.8733333333333332e-07, + "loss": 0.0001, + "num_tokens": 2777627.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 174.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.380171060562134, + "kl": 0.06992382183670998, + "learning_rate": 1.87e-07, + "loss": 0.0366, + "num_tokens": 2777934.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 174.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03872547671198845, + "kl": 0.04057466797530651, + "learning_rate": 1.8666666666666667e-07, + "loss": 0.002, + "num_tokens": 2778338.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 174.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.76997309923172, + "kl": 0.043212915770709515, + "learning_rate": 1.8633333333333335e-07, + "loss": 0.0029, + "num_tokens": 2778620.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 174.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02050241455435753, + "kl": 0.011363848112523556, + "learning_rate": 1.86e-07, + "loss": 0.0006, + "num_tokens": 2778934.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 174.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0011737901950255036, + "kl": 0.000479087233543396, + "learning_rate": 1.8566666666666667e-07, + "loss": 0.0, + "num_tokens": 2779194.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 174.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20299763977527618, + "kl": 0.02106556110084057, + "learning_rate": 1.8533333333333335e-07, + "loss": 0.0013, + "num_tokens": 2779478.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 174.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10031075030565262, + "kl": 0.0112114567309618, + "learning_rate": 1.85e-07, + "loss": 0.0006, + "num_tokens": 2779751.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 174.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0064816526137292385, + "kl": 0.1617884263396263, + "learning_rate": 1.8466666666666668e-07, + "loss": 0.0081, + "num_tokens": 2780060.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 174.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03821108117699623, + "kl": 0.0007517486810684204, + "learning_rate": 1.8433333333333336e-07, + "loss": 0.0, + "num_tokens": 2780316.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 174.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02685854770243168, + "kl": 0.0031300827395170927, + "learning_rate": 1.84e-07, + "loss": 0.0002, + "num_tokens": 2780548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 175.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0779147669672966, + "kl": 0.010218548122793436, + "learning_rate": 1.8366666666666666e-07, + "loss": 0.0005, + "num_tokens": 2780879.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 175.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016581114381551743, + "kl": 0.00044539570808410645, + "learning_rate": 1.8333333333333333e-07, + "loss": 0.0, + "num_tokens": 2781091.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 175.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044473301619291306, + "kl": 0.011632442474365234, + "learning_rate": 1.83e-07, + "loss": 0.0006, + "num_tokens": 2781416.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 175.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1050993874669075, + "kl": 0.023846641182899475, + "learning_rate": 1.8266666666666666e-07, + "loss": 0.0012, + "num_tokens": 2781690.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 175.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07377281785011292, + "kl": 0.006855746265500784, + "learning_rate": 1.8233333333333334e-07, + "loss": 0.0003, + "num_tokens": 2781990.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 175.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08001474291086197, + "kl": 0.008238217793405056, + "learning_rate": 1.8200000000000002e-07, + "loss": 0.0004, + "num_tokens": 2782262.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 175.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0440627820789814, + "kl": 0.03242575004696846, + "learning_rate": 1.8166666666666667e-07, + "loss": 0.0016, + "num_tokens": 2782562.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 175.12962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.663631439208984, + "kl": 0.04502807557582855, + "learning_rate": 1.8133333333333334e-07, + "loss": 0.0193, + "num_tokens": 2782823.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 175.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7345893383026123, + "kl": 0.012640192173421383, + "learning_rate": 1.8100000000000002e-07, + "loss": 0.0245, + "num_tokens": 2783181.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 175.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00011049036402255297, + "kl": 2.436339855194092e-06, + "learning_rate": 1.8066666666666667e-07, + "loss": 0.0, + "num_tokens": 2783401.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 175.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003071162384003401, + "kl": 0.0004178136441623792, + "learning_rate": 1.8033333333333332e-07, + "loss": 0.0, + "num_tokens": 2783620.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 175.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6407954692840576, + "kl": 0.0012169579276815057, + "learning_rate": 1.8e-07, + "loss": 0.0003, + "num_tokens": 2783892.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 175.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05120643600821495, + "kl": 0.008119082893244922, + "learning_rate": 1.7966666666666667e-07, + "loss": 0.0005, + "num_tokens": 2784192.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 175.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12649521231651306, + "kl": 0.09553695470094681, + "learning_rate": 1.7933333333333332e-07, + "loss": 0.0048, + "num_tokens": 2784561.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 175.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027567535638809204, + "kl": 0.0007620364413014613, + "learning_rate": 1.79e-07, + "loss": 0.0, + "num_tokens": 2784817.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 175.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03359556570649147, + "kl": 0.003997477513621561, + "learning_rate": 1.7866666666666668e-07, + "loss": 0.0002, + "num_tokens": 2785077.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 175.2962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.083592891693115, + "kl": 0.10924112051725388, + "learning_rate": 1.7833333333333333e-07, + "loss": 0.0732, + "num_tokens": 2785428.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 175.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05745260789990425, + "kl": 0.03714505583047867, + "learning_rate": 1.78e-07, + "loss": 0.0019, + "num_tokens": 2785805.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 175.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015942562371492386, + "kl": 0.006867463467642665, + "learning_rate": 1.7766666666666668e-07, + "loss": 0.0003, + "num_tokens": 2786101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 175.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09808575361967087, + "kl": 0.012375690042972565, + "learning_rate": 1.7733333333333336e-07, + "loss": 0.0006, + "num_tokens": 2786394.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 175.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027535833418369293, + "kl": 0.004792378516867757, + "learning_rate": 1.7699999999999998e-07, + "loss": 0.0003, + "num_tokens": 2786733.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 175.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024536410346627235, + "kl": 0.002142351266229525, + "learning_rate": 1.7666666666666666e-07, + "loss": 0.0001, + "num_tokens": 2787060.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 175.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055180296301841736, + "kl": 0.004505733493715525, + "learning_rate": 1.7633333333333334e-07, + "loss": 0.0002, + "num_tokens": 2787330.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 175.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054977238178253174, + "kl": 0.013792762532830238, + "learning_rate": 1.7600000000000001e-07, + "loss": 0.0007, + "num_tokens": 2787659.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 175.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03392626717686653, + "kl": 0.01006307639181614, + "learning_rate": 1.7566666666666666e-07, + "loss": 0.0005, + "num_tokens": 2787941.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 175.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007234607823193073, + "kl": 0.0014080610708333552, + "learning_rate": 1.7533333333333334e-07, + "loss": 0.0001, + "num_tokens": 2788218.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 175.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010728621855378151, + "kl": 0.0011618470889516175, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0001, + "num_tokens": 2788494.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 175.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020322097465395927, + "kl": 0.002917998470366001, + "learning_rate": 1.7466666666666667e-07, + "loss": 0.0001, + "num_tokens": 2788806.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 175.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028154859319329262, + "kl": 0.03807249292731285, + "learning_rate": 1.7433333333333335e-07, + "loss": 0.0019, + "num_tokens": 2789211.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 175.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010237826965749264, + "kl": 0.007193590514361858, + "learning_rate": 1.7400000000000002e-07, + "loss": 0.0004, + "num_tokens": 2789483.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 175.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06910774111747742, + "kl": 0.006038739811629057, + "learning_rate": 1.7366666666666667e-07, + "loss": 0.0003, + "num_tokens": 2789754.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 175.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6817761063575745, + "kl": 0.11062488332390785, + "learning_rate": 1.7333333333333332e-07, + "loss": 0.0058, + "num_tokens": 2790135.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 175.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004457696806639433, + "kl": 0.00023955106735229492, + "learning_rate": 1.73e-07, + "loss": 0.0, + "num_tokens": 2790379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 175.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0049491701647639275, + "kl": 0.0005636787973344326, + "learning_rate": 1.7266666666666668e-07, + "loss": 0.0, + "num_tokens": 2790663.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 175.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07905175536870956, + "kl": 0.00984371779486537, + "learning_rate": 1.7233333333333333e-07, + "loss": 0.0005, + "num_tokens": 2790925.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 175.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016738688573241234, + "kl": 0.000374002120224759, + "learning_rate": 1.72e-07, + "loss": 0.0, + "num_tokens": 2791161.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9485 + }, + { + "clip_ratio/high_max": 0.007936508394777775, + "clip_ratio/high_mean": 0.007936508394777775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.007936508394777775, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 175.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.760868549346924, + "kl": 0.13168331235647202, + "learning_rate": 1.7166666666666668e-07, + "loss": -0.0298, + "num_tokens": 2791511.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 175.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003948070574551821, + "kl": 0.000350169837474823, + "learning_rate": 1.7133333333333333e-07, + "loss": 0.0, + "num_tokens": 2791771.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 175.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0054215374402701855, + "kl": 0.0015189126133918762, + "learning_rate": 1.71e-07, + "loss": 0.0001, + "num_tokens": 2791987.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 175.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.437551259994507, + "kl": 0.10894716624170542, + "learning_rate": 1.7066666666666669e-07, + "loss": 0.0466, + "num_tokens": 2792323.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 175.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05501972511410713, + "kl": 0.013732909690588713, + "learning_rate": 1.7033333333333334e-07, + "loss": 0.0007, + "num_tokens": 2792610.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 175.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07589367777109146, + "kl": 0.02821679785847664, + "learning_rate": 1.7e-07, + "loss": 0.0015, + "num_tokens": 2792912.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 175.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030610423535108566, + "kl": 0.0006005242466926575, + "learning_rate": 1.6966666666666666e-07, + "loss": 0.0, + "num_tokens": 2793122.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 175.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014374910853803158, + "kl": 0.001605411758646369, + "learning_rate": 1.6933333333333334e-07, + "loss": 0.0001, + "num_tokens": 2793452.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 175.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.328324317932129, + "kl": 0.13618933409452438, + "learning_rate": 1.69e-07, + "loss": -0.3253, + "num_tokens": 2793746.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 175.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030537644401192665, + "kl": 0.0034228264703415334, + "learning_rate": 1.6866666666666667e-07, + "loss": 0.0002, + "num_tokens": 2794004.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 175.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05691957101225853, + "kl": 0.007668401347473264, + "learning_rate": 1.6833333333333335e-07, + "loss": 0.0004, + "num_tokens": 2794326.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 175.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026415415108203888, + "kl": 0.0015674991300329566, + "learning_rate": 1.68e-07, + "loss": 0.0001, + "num_tokens": 2794622.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 175.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016319707501679659, + "kl": 0.00015536861610598862, + "learning_rate": 1.6766666666666667e-07, + "loss": 0.0, + "num_tokens": 2794936.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 175.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046219997107982635, + "kl": 0.006617294391617179, + "learning_rate": 1.6733333333333335e-07, + "loss": 0.0003, + "num_tokens": 2795229.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 175.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012055712286382914, + "kl": 2.7142465114593506e-05, + "learning_rate": 1.6700000000000003e-07, + "loss": 0.0, + "num_tokens": 2795441.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 175.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015128085389733315, + "kl": 0.2658979743719101, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0133, + "num_tokens": 2795745.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 175.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0355367511510849, + "kl": 0.15592289716005325, + "learning_rate": 1.6633333333333333e-07, + "loss": 0.0078, + "num_tokens": 2796058.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 175.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03484673053026199, + "kl": 0.021474342793226242, + "learning_rate": 1.66e-07, + "loss": 0.0012, + "num_tokens": 2796335.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019219344248995185, + "kl": 0.0035479143261909485, + "learning_rate": 1.6566666666666665e-07, + "loss": 0.0002, + "num_tokens": 2796571.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 176.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05576057359576225, + "kl": 0.028212859178893268, + "learning_rate": 1.6533333333333333e-07, + "loss": 0.0014, + "num_tokens": 2796843.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 176.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032508548349142075, + "kl": 0.004489997983910143, + "learning_rate": 1.65e-07, + "loss": 0.0002, + "num_tokens": 2797111.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 176.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10765808075666428, + "kl": 0.03259006887674332, + "learning_rate": 1.6466666666666669e-07, + "loss": 0.0016, + "num_tokens": 2797493.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 176.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.296911716461182, + "kl": 0.06652853265404701, + "learning_rate": 1.6433333333333334e-07, + "loss": -0.0113, + "num_tokens": 2797778.0, + "reward": 6.125, + "reward_std": 3.4247870445251465, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.4247870445251465, + "step": 9508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 176.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014875221066176891, + "kl": 0.0035690803051693365, + "learning_rate": 1.64e-07, + "loss": 0.0001, + "num_tokens": 2798038.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018377848900854588, + "kl": 0.0035630017518997192, + "learning_rate": 1.636666666666667e-07, + "loss": 0.0002, + "num_tokens": 2798274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 176.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020880894735455513, + "kl": 0.0007867937456467189, + "learning_rate": 1.6333333333333334e-07, + "loss": 0.0, + "num_tokens": 2798588.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 176.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04607066884636879, + "kl": 0.003699503722600639, + "learning_rate": 1.63e-07, + "loss": 0.0002, + "num_tokens": 2798890.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 176.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017043661326169968, + "kl": 0.012803932186216116, + "learning_rate": 1.6266666666666667e-07, + "loss": 0.0006, + "num_tokens": 2799150.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 176.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6555036306381226, + "kl": 0.02057200577110052, + "learning_rate": 1.6233333333333334e-07, + "loss": 0.0007, + "num_tokens": 2799502.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 176.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027277125045657158, + "kl": 0.0009510765812592581, + "learning_rate": 1.62e-07, + "loss": 0.0, + "num_tokens": 2799737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 176.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1946507841348648, + "kl": 0.023479865863919258, + "learning_rate": 1.6166666666666667e-07, + "loss": 0.0012, + "num_tokens": 2800013.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 176.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08500733226537704, + "kl": 0.026968365535140038, + "learning_rate": 1.6133333333333335e-07, + "loss": 0.0014, + "num_tokens": 2800301.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 176.25925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.235119342803955, + "kl": 0.09658414218574762, + "learning_rate": 1.61e-07, + "loss": 0.0434, + "num_tokens": 2800608.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 176.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07327579706907272, + "kl": 0.011227508570300415, + "learning_rate": 1.6066666666666668e-07, + "loss": 0.0004, + "num_tokens": 2800927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 176.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04222364351153374, + "kl": 0.03229001723229885, + "learning_rate": 1.6033333333333335e-07, + "loss": 0.0017, + "num_tokens": 2801288.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 176.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03203504532575607, + "kl": 0.002618239726871252, + "learning_rate": 1.6e-07, + "loss": 0.0001, + "num_tokens": 2801560.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 176.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.730062007904053, + "kl": 0.06036641966784373, + "learning_rate": 1.5966666666666665e-07, + "loss": 0.1174, + "num_tokens": 2801858.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 176.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08959072828292847, + "kl": 0.03773561678826809, + "learning_rate": 1.5933333333333333e-07, + "loss": 0.0019, + "num_tokens": 2802276.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 176.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0642068162560463, + "kl": 0.008523573633283377, + "learning_rate": 1.59e-07, + "loss": 0.0004, + "num_tokens": 2802568.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 176.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007662872783839703, + "kl": 0.0014800818171352148, + "learning_rate": 1.5866666666666666e-07, + "loss": 0.0001, + "num_tokens": 2802842.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 176.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028721291571855545, + "kl": 0.01291783805936575, + "learning_rate": 1.5833333333333333e-07, + "loss": 0.0007, + "num_tokens": 2803114.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 176.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04732772707939148, + "kl": 0.001399710774421692, + "learning_rate": 1.58e-07, + "loss": 0.0001, + "num_tokens": 2803330.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 176.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011455900967121124, + "kl": 0.0016176364151760936, + "learning_rate": 1.5766666666666666e-07, + "loss": 0.0001, + "num_tokens": 2803658.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.999152183532715, + "kl": 0.7840666137635708, + "learning_rate": 1.5733333333333334e-07, + "loss": -0.1398, + "num_tokens": 2803898.0, + "reward": 3.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 3.875, + "rewards/reward_combined/std": 0.25, + "step": 9529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 176.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9502185583114624, + "kl": 0.3401731550693512, + "learning_rate": 1.5700000000000002e-07, + "loss": 0.0183, + "num_tokens": 2804220.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 176.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07650213688611984, + "kl": 0.03174298210069537, + "learning_rate": 1.566666666666667e-07, + "loss": 0.0015, + "num_tokens": 2804527.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 176.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10376128554344177, + "kl": 0.00576256331987679, + "learning_rate": 1.5633333333333332e-07, + "loss": 0.0002, + "num_tokens": 2804795.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 176.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09101668000221252, + "kl": 0.01548024226212874, + "learning_rate": 1.56e-07, + "loss": 0.0009, + "num_tokens": 2805095.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 176.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09454462677240372, + "kl": 0.039939695969223976, + "learning_rate": 1.5566666666666667e-07, + "loss": 0.0022, + "num_tokens": 2805437.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 176.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005667231045663357, + "kl": 0.1616503745317459, + "learning_rate": 1.5533333333333332e-07, + "loss": 0.0081, + "num_tokens": 2805746.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 176.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5714551210403442, + "kl": 0.07886955514550209, + "learning_rate": 1.55e-07, + "loss": 0.0049, + "num_tokens": 2806120.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 176.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033408455550670624, + "kl": 0.008181490702554584, + "learning_rate": 1.5466666666666668e-07, + "loss": 0.0005, + "num_tokens": 2806447.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 176.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03365100175142288, + "kl": 0.0007914025336503983, + "learning_rate": 1.5433333333333335e-07, + "loss": 0.0, + "num_tokens": 2806704.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 176.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03636021167039871, + "kl": 0.00307451281696558, + "learning_rate": 1.54e-07, + "loss": 0.0002, + "num_tokens": 2807016.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 176.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005419056862592697, + "kl": 0.00020385532843647525, + "learning_rate": 1.5366666666666668e-07, + "loss": 0.0, + "num_tokens": 2807288.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 176.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12775497138500214, + "kl": 0.013672416796907783, + "learning_rate": 1.5333333333333336e-07, + "loss": 0.0007, + "num_tokens": 2807624.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 176.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029871614649891853, + "kl": 0.003635496774222702, + "learning_rate": 1.53e-07, + "loss": 0.0002, + "num_tokens": 2807882.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 176.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.061885356903076, + "kl": 0.13871409744024277, + "learning_rate": 1.5266666666666666e-07, + "loss": 0.0086, + "num_tokens": 2808254.0, + "reward": 2.875, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 1.25, + "step": 9543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 176.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005721048917621374, + "kl": 0.0005310453125275671, + "learning_rate": 1.5233333333333333e-07, + "loss": 0.0, + "num_tokens": 2808538.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.8619801014428958e-05, + "kl": 2.3692846298217773e-06, + "learning_rate": 1.52e-07, + "loss": 0.0, + "num_tokens": 2808758.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07172656804323196, + "kl": 0.00579028413631022, + "learning_rate": 1.5166666666666666e-07, + "loss": 0.0004, + "num_tokens": 2808989.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 176.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07688991725444794, + "kl": 0.003436783794313669, + "learning_rate": 1.5133333333333334e-07, + "loss": 0.0002, + "num_tokens": 2809253.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 176.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0055934228003025055, + "kl": 0.00033292174339294434, + "learning_rate": 1.5100000000000002e-07, + "loss": 0.0, + "num_tokens": 2809461.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 176.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06430573761463165, + "kl": 0.0026778578758239746, + "learning_rate": 1.5066666666666667e-07, + "loss": 0.0001, + "num_tokens": 2809721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9549 + }, + { + "clip_ratio/high_max": 0.008771929889917374, + "clip_ratio/high_mean": 0.008771929889917374, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008771929889917374, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 176.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.503368377685547, + "kl": 0.4967469163239002, + "learning_rate": 1.5033333333333334e-07, + "loss": 0.0086, + "num_tokens": 2810090.0, + "reward": 3.375, + "reward_std": 4.8712592124938965, + "rewards/reward_combined/mean": 3.375, + "rewards/reward_combined/std": 4.8712592124938965, + "step": 9550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029600251466035843, + "kl": 0.0017618819620111026, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0001, + "num_tokens": 2810309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 176.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008202710188925266, + "kl": 0.0003418431442696601, + "learning_rate": 1.4966666666666667e-07, + "loss": 0.0, + "num_tokens": 2810626.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 176.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031754862517118454, + "kl": 0.007476957864128053, + "learning_rate": 1.4933333333333332e-07, + "loss": 0.0003, + "num_tokens": 2810982.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024407673627138138, + "kl": 1.6644597053527832e-05, + "learning_rate": 1.49e-07, + "loss": 0.0, + "num_tokens": 2811194.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 176.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03923211246728897, + "kl": 0.004935101140290499, + "learning_rate": 1.4866666666666667e-07, + "loss": 0.0002, + "num_tokens": 2811494.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 176.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046997226774692535, + "kl": 0.0022775634424760938, + "learning_rate": 1.4833333333333332e-07, + "loss": 0.0001, + "num_tokens": 2811748.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 176.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.456135272979736, + "kl": 0.03165629622526467, + "learning_rate": 1.48e-07, + "loss": 0.0128, + "num_tokens": 2812045.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 9557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 177.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6208432912826538, + "kl": 0.09374172985553741, + "learning_rate": 1.4766666666666668e-07, + "loss": -0.0305, + "num_tokens": 2812426.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 177.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036625511944293976, + "kl": 0.003062780946493149, + "learning_rate": 1.4733333333333333e-07, + "loss": 0.0002, + "num_tokens": 2812738.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 177.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13599556684494019, + "kl": 0.0067896172404289246, + "learning_rate": 1.47e-07, + "loss": 0.0003, + "num_tokens": 2812982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 177.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02631358802318573, + "kl": 0.0012416014797054231, + "learning_rate": 1.4666666666666668e-07, + "loss": 0.0001, + "num_tokens": 2813272.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.5, + "completions/mean_terminated_length": 35.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 177.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12596699595451355, + "kl": 0.06198902800679207, + "learning_rate": 1.4633333333333336e-07, + "loss": 0.0031, + "num_tokens": 2813642.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 177.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019427374005317688, + "kl": 0.0008019626038731076, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.0, + "num_tokens": 2813898.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 177.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04771993309259415, + "kl": 0.026112916879355907, + "learning_rate": 1.4566666666666666e-07, + "loss": 0.0014, + "num_tokens": 2814186.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 177.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23239776492118835, + "kl": 0.041260702069848776, + "learning_rate": 1.4533333333333334e-07, + "loss": 0.002, + "num_tokens": 2814482.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 177.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06313328444957733, + "kl": 0.004499338800087571, + "learning_rate": 1.45e-07, + "loss": 0.0002, + "num_tokens": 2814746.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 177.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0063460092060267925, + "kl": 0.16387245059013367, + "learning_rate": 1.4466666666666667e-07, + "loss": 0.0082, + "num_tokens": 2815054.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 177.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07178795337677002, + "kl": 0.005559865618124604, + "learning_rate": 1.4433333333333334e-07, + "loss": 0.0003, + "num_tokens": 2815331.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 177.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0264766663312912, + "kl": 0.0019895622390322387, + "learning_rate": 1.4400000000000002e-07, + "loss": 0.0001, + "num_tokens": 2815631.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 177.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019687240943312645, + "kl": 0.09615558013319969, + "learning_rate": 1.4366666666666667e-07, + "loss": 0.0048, + "num_tokens": 2816003.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 177.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003190296469256282, + "kl": 0.0001267552434001118, + "learning_rate": 1.4333333333333335e-07, + "loss": 0.0, + "num_tokens": 2816223.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 177.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06155973672866821, + "kl": 0.05993828922510147, + "learning_rate": 1.4300000000000002e-07, + "loss": 0.003, + "num_tokens": 2816591.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 177.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005751395132392645, + "kl": 0.0003493606927804649, + "learning_rate": 1.4266666666666667e-07, + "loss": 0.0, + "num_tokens": 2816851.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 177.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057162024080753326, + "kl": 0.011976105161011219, + "learning_rate": 1.4233333333333332e-07, + "loss": 0.0006, + "num_tokens": 2817194.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 177.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.441741036018357e-05, + "kl": 2.294778823852539e-06, + "learning_rate": 1.42e-07, + "loss": 0.0, + "num_tokens": 2817414.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 177.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027738021686673164, + "kl": 0.026988955214619637, + "learning_rate": 1.4166666666666668e-07, + "loss": 0.0014, + "num_tokens": 2817716.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 177.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011388730257749557, + "kl": 0.011602215701714158, + "learning_rate": 1.4133333333333333e-07, + "loss": 0.0007, + "num_tokens": 2817990.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 177.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017021670937538147, + "kl": 0.0005283690989017487, + "learning_rate": 1.41e-07, + "loss": 0.0, + "num_tokens": 2818250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 177.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08981374651193619, + "kl": 0.012530907988548279, + "learning_rate": 1.4066666666666668e-07, + "loss": 0.0007, + "num_tokens": 2818536.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 177.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.168918609619141, + "kl": 0.15909979492425919, + "learning_rate": 1.4033333333333333e-07, + "loss": 0.2073, + "num_tokens": 2818828.0, + "reward": 6.125, + "reward_std": 3.75, + "rewards/reward_combined/mean": 6.125, + "rewards/reward_combined/std": 3.75, + "step": 9580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 177.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 12.229592323303223, + "kl": 1.2701507389429025, + "learning_rate": 1.4e-07, + "loss": 0.0602, + "num_tokens": 2819087.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 177.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011176024563610554, + "kl": 0.0072957759257406, + "learning_rate": 1.3966666666666669e-07, + "loss": 0.0004, + "num_tokens": 2819359.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 177.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030984047800302505, + "kl": 0.005335319088771939, + "learning_rate": 1.3933333333333334e-07, + "loss": 0.0003, + "num_tokens": 2819654.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 177.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09007994830608368, + "kl": 0.0016735196113586426, + "learning_rate": 1.39e-07, + "loss": 0.0001, + "num_tokens": 2819874.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 177.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07154640555381775, + "kl": 0.004637794831069186, + "learning_rate": 1.3866666666666666e-07, + "loss": 0.0002, + "num_tokens": 2820148.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 177.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20592337846755981, + "kl": 0.01621579035418108, + "learning_rate": 1.3833333333333334e-07, + "loss": 0.0009, + "num_tokens": 2820434.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 177.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017338179051876068, + "kl": 0.012760586105287075, + "learning_rate": 1.38e-07, + "loss": 0.0006, + "num_tokens": 2820694.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 177.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019098149612545967, + "kl": 0.0020502295228652656, + "learning_rate": 1.3766666666666667e-07, + "loss": 0.0001, + "num_tokens": 2820964.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 177.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17156332731246948, + "kl": 0.37659671571600484, + "learning_rate": 1.3733333333333335e-07, + "loss": 0.0039, + "num_tokens": 2821275.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 30.0, + "completions/mean_terminated_length": 30.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 177.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4867427349090576, + "kl": 0.06598281487822533, + "learning_rate": 1.37e-07, + "loss": -0.0335, + "num_tokens": 2821675.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 9590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 2.5, + "completions/mean_terminated_length": 2.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 177.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017378803342580795, + "kl": 0.0003652125597000122, + "learning_rate": 1.3666666666666667e-07, + "loss": 0.0, + "num_tokens": 2821881.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 177.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11058441549539566, + "kl": 0.01982554141432047, + "learning_rate": 1.3633333333333335e-07, + "loss": 0.0012, + "num_tokens": 2822160.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 177.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02188039757311344, + "kl": 0.010243285563774407, + "learning_rate": 1.36e-07, + "loss": 0.0004, + "num_tokens": 2822483.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 177.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07262875884771347, + "kl": 0.01694716513156891, + "learning_rate": 1.3566666666666665e-07, + "loss": 0.0008, + "num_tokens": 2822809.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 177.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01224533375352621, + "kl": 0.0004906567046418786, + "learning_rate": 1.3533333333333333e-07, + "loss": 0.0, + "num_tokens": 2823132.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 177.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002286963164806366, + "kl": 2.419203519821167e-05, + "learning_rate": 1.35e-07, + "loss": 0.0, + "num_tokens": 2823344.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 177.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004637403879314661, + "kl": 0.0001556376664666459, + "learning_rate": 1.3466666666666665e-07, + "loss": 0.0, + "num_tokens": 2823616.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 177.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.643313407897949, + "kl": 0.02667102124541998, + "learning_rate": 1.3433333333333333e-07, + "loss": 0.0297, + "num_tokens": 2823951.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 177.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025852812454104424, + "kl": 0.002269966993480921, + "learning_rate": 1.34e-07, + "loss": 0.0001, + "num_tokens": 2824184.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 177.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1519833207130432, + "kl": 0.03485689498484135, + "learning_rate": 1.3366666666666669e-07, + "loss": 0.0018, + "num_tokens": 2824522.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 177.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003160873893648386, + "kl": 0.0012445024913176894, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.0001, + "num_tokens": 2824802.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 177.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04331769049167633, + "kl": 0.0060004518600180745, + "learning_rate": 1.33e-07, + "loss": 0.0003, + "num_tokens": 2825122.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 177.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029098965227603912, + "kl": 0.0016439953760709614, + "learning_rate": 1.326666666666667e-07, + "loss": 0.0001, + "num_tokens": 2825357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 177.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8213188648223877, + "kl": 0.03336055390536785, + "learning_rate": 1.3233333333333331e-07, + "loss": 0.0282, + "num_tokens": 2825684.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 177.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9011693000793457, + "kl": 0.08182980120182037, + "learning_rate": 1.32e-07, + "loss": -0.0125, + "num_tokens": 2825987.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 177.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09659013897180557, + "kl": 0.017395183676853776, + "learning_rate": 1.3166666666666667e-07, + "loss": 0.0009, + "num_tokens": 2826321.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 177.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064116932451725, + "kl": 0.048350848257541656, + "learning_rate": 1.3133333333333334e-07, + "loss": 0.0024, + "num_tokens": 2826659.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 177.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06287820637226105, + "kl": 0.005229388130828738, + "learning_rate": 1.31e-07, + "loss": 0.0003, + "num_tokens": 2826957.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 177.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030907947570085526, + "kl": 0.0037443265318870544, + "learning_rate": 1.3066666666666667e-07, + "loss": 0.0002, + "num_tokens": 2827248.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 34.25, + "completions/mean_terminated_length": 34.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 177.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02483222633600235, + "kl": 0.018993492238223553, + "learning_rate": 1.3033333333333335e-07, + "loss": 0.001, + "num_tokens": 2827609.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 177.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0145722646266222, + "kl": 0.2659342437982559, + "learning_rate": 1.3e-07, + "loss": 0.0133, + "num_tokens": 2827913.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 178.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019047962268814445, + "kl": 0.0035469159483909607, + "learning_rate": 1.2966666666666668e-07, + "loss": 0.0002, + "num_tokens": 2828149.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 178.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03985825553536415, + "kl": 0.007996839005500078, + "learning_rate": 1.2933333333333335e-07, + "loss": 0.0004, + "num_tokens": 2828438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 178.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06788503378629684, + "kl": 0.06598960235714912, + "learning_rate": 1.29e-07, + "loss": 0.0033, + "num_tokens": 2828819.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 178.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396779716014862, + "kl": 0.018114380538463593, + "learning_rate": 1.2866666666666665e-07, + "loss": 0.001, + "num_tokens": 2829109.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 178.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19356422126293182, + "kl": 0.02031507482752204, + "learning_rate": 1.2833333333333333e-07, + "loss": 0.0011, + "num_tokens": 2829429.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 178.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058159466832876205, + "kl": 0.0021013430086895823, + "learning_rate": 1.28e-07, + "loss": 0.0001, + "num_tokens": 2829723.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 178.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027771631255745888, + "kl": 0.03797489311546087, + "learning_rate": 1.2766666666666666e-07, + "loss": 0.0019, + "num_tokens": 2830128.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 178.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010564511641860008, + "kl": 0.000665765255689621, + "learning_rate": 1.2733333333333334e-07, + "loss": 0.0, + "num_tokens": 2830388.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 178.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028403762727975845, + "kl": 0.004304815316572785, + "learning_rate": 1.27e-07, + "loss": 0.0002, + "num_tokens": 2830715.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0008022031397558749, + "kl": 0.0013871045666746795, + "learning_rate": 1.2666666666666666e-07, + "loss": 0.0001, + "num_tokens": 2830989.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 178.1851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2146918773651123, + "kl": 0.08477691747248173, + "learning_rate": 1.2633333333333334e-07, + "loss": 0.0643, + "num_tokens": 2831322.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 178.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028722399845719337, + "kl": 0.004485728684812784, + "learning_rate": 1.2600000000000002e-07, + "loss": 0.0002, + "num_tokens": 2831620.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 178.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14467935264110565, + "kl": 0.16932255029678345, + "learning_rate": 1.2566666666666667e-07, + "loss": 0.0085, + "num_tokens": 2831936.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033596307039260864, + "kl": 0.0013881406630389392, + "learning_rate": 1.2533333333333332e-07, + "loss": 0.0001, + "num_tokens": 2832208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 178.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014502130448818207, + "kl": 0.2659769505262375, + "learning_rate": 1.25e-07, + "loss": 0.0133, + "num_tokens": 2832512.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 178.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6548380851745605, + "kl": 0.02610295871272683, + "learning_rate": 1.2466666666666667e-07, + "loss": -0.0137, + "num_tokens": 2832817.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 178.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011660748161375523, + "kl": 0.00016899704860406928, + "learning_rate": 1.2433333333333332e-07, + "loss": 0.0, + "num_tokens": 2833073.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 178.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06711766123771667, + "kl": 0.003436968778260052, + "learning_rate": 1.24e-07, + "loss": 0.0002, + "num_tokens": 2833340.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 5.5, + "completions/mean_terminated_length": 5.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 178.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02088790573179722, + "kl": 0.0003516205833875574, + "learning_rate": 1.2366666666666668e-07, + "loss": 0.0, + "num_tokens": 2833562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 178.35185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5831379890441895, + "kl": 0.18136628530919552, + "learning_rate": 1.2333333333333335e-07, + "loss": -0.0158, + "num_tokens": 2833920.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 178.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022614121437072754, + "kl": 0.001610791718121618, + "learning_rate": 1.23e-07, + "loss": 0.0001, + "num_tokens": 2834180.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 178.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02911880612373352, + "kl": 0.00682230263191741, + "learning_rate": 1.2266666666666668e-07, + "loss": 0.0003, + "num_tokens": 2834452.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 178.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7033159136772156, + "kl": 0.0785313555970788, + "learning_rate": 1.2233333333333336e-07, + "loss": 0.0067, + "num_tokens": 2834717.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 178.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.678153991699219, + "kl": 0.010550772189162672, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.2468, + "num_tokens": 2834937.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 178.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024378197267651558, + "kl": 0.005471828859299421, + "learning_rate": 1.2166666666666666e-07, + "loss": 0.0003, + "num_tokens": 2835278.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017940865829586983, + "kl": 0.0008927244052756578, + "learning_rate": 1.2133333333333333e-07, + "loss": 0.0, + "num_tokens": 2835560.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 178.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025685062631964684, + "kl": 0.0010303754970664158, + "learning_rate": 1.21e-07, + "loss": 0.0001, + "num_tokens": 2835872.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 178.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12392605096101761, + "kl": 0.03731744363903999, + "learning_rate": 1.2066666666666666e-07, + "loss": 0.0019, + "num_tokens": 2836174.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 178.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.473946880054427e-06, + "kl": 1.9222497940063477e-06, + "learning_rate": 1.2033333333333334e-07, + "loss": 0.0, + "num_tokens": 2836394.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 178.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012653307989239693, + "kl": 0.0018477363046258688, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.0001, + "num_tokens": 2836708.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03158792480826378, + "kl": 0.019087360240519047, + "learning_rate": 1.1966666666666667e-07, + "loss": 0.0011, + "num_tokens": 2836988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 178.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05164724215865135, + "kl": 0.0016252377245109528, + "learning_rate": 1.1933333333333334e-07, + "loss": 0.0001, + "num_tokens": 2837222.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9643 + }, + { + "clip_ratio/high_max": 0.01515151560306549, + "clip_ratio/high_mean": 0.01515151560306549, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01515151560306549, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 178.59259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.528735160827637, + "kl": 0.027745794504880905, + "learning_rate": 1.1900000000000001e-07, + "loss": 0.1967, + "num_tokens": 2837502.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 178.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5991106033325195, + "kl": 0.1108491700142622, + "learning_rate": 1.1866666666666666e-07, + "loss": 0.2138, + "num_tokens": 2837811.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 178.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018112411722540855, + "kl": 0.012603074312210083, + "learning_rate": 1.1833333333333333e-07, + "loss": 0.0006, + "num_tokens": 2838071.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 178.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012119478546082973, + "kl": 0.001166579604614526, + "learning_rate": 1.18e-07, + "loss": 0.0001, + "num_tokens": 2838345.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 178.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03568000718951225, + "kl": 0.009528001770377159, + "learning_rate": 1.1766666666666666e-07, + "loss": 0.0005, + "num_tokens": 2838694.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 178.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003912889398634434, + "kl": 0.00022037699818611145, + "learning_rate": 1.1733333333333334e-07, + "loss": 0.0, + "num_tokens": 2838938.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 178.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06269144266843796, + "kl": 0.020347768906503916, + "learning_rate": 1.17e-07, + "loss": 0.001, + "num_tokens": 2839210.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 178.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03186080977320671, + "kl": 0.00934227230027318, + "learning_rate": 1.1666666666666667e-07, + "loss": 0.0005, + "num_tokens": 2839529.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 178.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03496990725398064, + "kl": 0.017681284807622433, + "learning_rate": 1.1633333333333334e-07, + "loss": 0.0008, + "num_tokens": 2839856.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 178.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15210020542144775, + "kl": 0.019458720460534096, + "learning_rate": 1.16e-07, + "loss": 0.001, + "num_tokens": 2840140.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 178.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052592452615499496, + "kl": 0.05388986878097057, + "learning_rate": 1.1566666666666668e-07, + "loss": 0.0027, + "num_tokens": 2840485.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 178.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10374416410923004, + "kl": 0.030501834116876125, + "learning_rate": 1.1533333333333335e-07, + "loss": 0.0016, + "num_tokens": 2840797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022376513108611107, + "kl": 0.0018261033692397177, + "learning_rate": 1.15e-07, + "loss": 0.0001, + "num_tokens": 2841093.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 178.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010112726129591465, + "kl": 0.00042986913467757404, + "learning_rate": 1.1466666666666666e-07, + "loss": 0.0, + "num_tokens": 2841411.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 178.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001916148466989398, + "kl": 1.3880431652069092e-05, + "learning_rate": 1.1433333333333332e-07, + "loss": 0.0, + "num_tokens": 2841623.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 178.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006173481699079275, + "kl": 0.00018824636936187744, + "learning_rate": 1.14e-07, + "loss": 0.0, + "num_tokens": 2841831.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 178.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012239635922014713, + "kl": 0.000723128963727504, + "learning_rate": 1.1366666666666667e-07, + "loss": 0.0, + "num_tokens": 2842099.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 178.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018316024215891957, + "kl": 0.0035710036754608154, + "learning_rate": 1.1333333333333334e-07, + "loss": 0.0002, + "num_tokens": 2842335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 178.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027970748487859964, + "kl": 0.0003119918255833909, + "learning_rate": 1.13e-07, + "loss": 0.0, + "num_tokens": 2842597.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 178.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01072064321488142, + "kl": 0.007303948746994138, + "learning_rate": 1.1266666666666667e-07, + "loss": 0.0004, + "num_tokens": 2842869.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 178.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023765217512845993, + "kl": 0.07397115416824818, + "learning_rate": 1.1233333333333335e-07, + "loss": 0.0037, + "num_tokens": 2843239.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 178.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06605005264282227, + "kl": 0.020146341994404793, + "learning_rate": 1.1200000000000001e-07, + "loss": 0.001, + "num_tokens": 2843541.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 38.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 179.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9219770431518555, + "kl": 0.07599946111440659, + "learning_rate": 1.1166666666666666e-07, + "loss": 0.0102, + "num_tokens": 2843909.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 179.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018040143186226487, + "kl": 0.0035718977451324463, + "learning_rate": 1.1133333333333332e-07, + "loss": 0.0002, + "num_tokens": 2844145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 179.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014750920236110687, + "kl": 0.0017863232642412186, + "learning_rate": 1.11e-07, + "loss": 0.0001, + "num_tokens": 2844457.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11244037002325058, + "kl": 0.006736365205142647, + "learning_rate": 1.1066666666666667e-07, + "loss": 0.0003, + "num_tokens": 2844753.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 179.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001369985518977046, + "kl": 3.6619603633880615e-05, + "learning_rate": 1.1033333333333333e-07, + "loss": 0.0, + "num_tokens": 2844965.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 179.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08508226275444031, + "kl": 0.02371902298182249, + "learning_rate": 1.1e-07, + "loss": 0.0012, + "num_tokens": 2845238.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 179.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057099487632513046, + "kl": 0.007750753313302994, + "learning_rate": 1.0966666666666667e-07, + "loss": 0.0004, + "num_tokens": 2845531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 179.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30677762627601624, + "kl": 0.031641120091080666, + "learning_rate": 1.0933333333333335e-07, + "loss": 0.0019, + "num_tokens": 2845808.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 179.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927388146519661, + "kl": 0.004070740658789873, + "learning_rate": 1.0900000000000001e-07, + "loss": 0.0002, + "num_tokens": 2846064.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 179.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002588885836303234, + "kl": 0.0004719384014606476, + "learning_rate": 1.0866666666666667e-07, + "loss": 0.0, + "num_tokens": 2846324.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.75, + "completions/mean_terminated_length": 22.75, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 179.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02736266329884529, + "kl": 0.0034303624415770173, + "learning_rate": 1.0833333333333332e-07, + "loss": 0.0002, + "num_tokens": 2846663.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.2037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.112271308898926, + "kl": 0.04132060831761919, + "learning_rate": 1.08e-07, + "loss": 0.0487, + "num_tokens": 2846948.0, + "reward": 7.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 7.25, + "rewards/reward_combined/std": 1.5, + "step": 9677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 179.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017123788595199585, + "kl": 0.012875073589384556, + "learning_rate": 1.0766666666666666e-07, + "loss": 0.0006, + "num_tokens": 2847208.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 179.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005060833413153887, + "kl": 0.0001884127632365562, + "learning_rate": 1.0733333333333333e-07, + "loss": 0.0, + "num_tokens": 2847480.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 179.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10962368547916412, + "kl": 0.018941693706437945, + "learning_rate": 1.07e-07, + "loss": 0.001, + "num_tokens": 2847804.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 179.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03470403328537941, + "kl": 0.0005734115839004517, + "learning_rate": 1.0666666666666667e-07, + "loss": 0.0, + "num_tokens": 2848048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 179.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08530457317829132, + "kl": 0.034948455169796944, + "learning_rate": 1.0633333333333333e-07, + "loss": 0.0017, + "num_tokens": 2848383.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02458912320435047, + "kl": 0.0122128298971802, + "learning_rate": 1.0600000000000001e-07, + "loss": 0.0007, + "num_tokens": 2848657.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 179.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04479043185710907, + "kl": 0.035796113312244415, + "learning_rate": 1.0566666666666667e-07, + "loss": 0.0018, + "num_tokens": 2848959.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 179.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07037424296140671, + "kl": 0.006918626604601741, + "learning_rate": 1.0533333333333335e-07, + "loss": 0.0003, + "num_tokens": 2849221.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 179.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021594589576125145, + "kl": 0.0006519470916828141, + "learning_rate": 1.0500000000000001e-07, + "loss": 0.0, + "num_tokens": 2849456.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 179.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08782421797513962, + "kl": 0.01188184879720211, + "learning_rate": 1.0466666666666666e-07, + "loss": 0.0005, + "num_tokens": 2849728.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 179.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015624243766069412, + "kl": 0.0013450205442495644, + "learning_rate": 1.0433333333333333e-07, + "loss": 0.0001, + "num_tokens": 2849988.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.007462686393409967, + "clip_ratio/low_min": 0.007462686393409967, + "clip_ratio/region_mean": 0.007462686393409967, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 29.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 179.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.651930809020996, + "kl": 0.027714114636182785, + "learning_rate": 1.0399999999999999e-07, + "loss": 0.1709, + "num_tokens": 2850335.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 179.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03454925864934921, + "kl": 0.025273829407524318, + "learning_rate": 1.0366666666666667e-07, + "loss": 0.0013, + "num_tokens": 2850624.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 179.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3167155385017395, + "kl": 0.0384307811036706, + "learning_rate": 1.0333333333333333e-07, + "loss": 0.002, + "num_tokens": 2850927.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 179.4814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.102170944213867, + "kl": 0.1525644026696682, + "learning_rate": 1.0300000000000001e-07, + "loss": 0.0182, + "num_tokens": 2851301.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 179.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02888142503798008, + "kl": 0.0022603245452046394, + "learning_rate": 1.0266666666666667e-07, + "loss": 0.0001, + "num_tokens": 2851601.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 179.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014978638850152493, + "kl": 0.26587051153182983, + "learning_rate": 1.0233333333333334e-07, + "loss": 0.0133, + "num_tokens": 2851905.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00048795342445373535, + "kl": 0.0013157953508198261, + "learning_rate": 1.0200000000000001e-07, + "loss": 0.0001, + "num_tokens": 2852182.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 24.25, + "completions/mean_terminated_length": 24.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 179.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026828717440366745, + "kl": 0.04668557830154896, + "learning_rate": 1.0166666666666668e-07, + "loss": 0.0023, + "num_tokens": 2852515.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 179.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.440510272979736, + "kl": 0.42124919034540653, + "learning_rate": 1.0133333333333333e-07, + "loss": 0.2074, + "num_tokens": 2852813.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 9697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 179.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08777348697185516, + "kl": 0.0058256154879927635, + "learning_rate": 1.0099999999999999e-07, + "loss": 0.0004, + "num_tokens": 2853028.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 179.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23188771307468414, + "kl": 0.07376392185688019, + "learning_rate": 1.0066666666666667e-07, + "loss": 0.0034, + "num_tokens": 2853336.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 179.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1250053197145462, + "kl": 0.008664275519549847, + "learning_rate": 1.0033333333333333e-07, + "loss": 0.0004, + "num_tokens": 2853662.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.25, + "completions/mean_terminated_length": 12.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083100438117981, + "kl": 0.004676389042288065, + "learning_rate": 1e-07, + "loss": 0.0002, + "num_tokens": 2853923.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 179.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007782944943755865, + "kl": 0.16182966530323029, + "learning_rate": 9.966666666666667e-08, + "loss": 0.0081, + "num_tokens": 2854232.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 179.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003936781547963619, + "kl": 0.0013590991293312982, + "learning_rate": 9.933333333333334e-08, + "loss": 0.0001, + "num_tokens": 2854451.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 179.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0384286493062973, + "kl": 0.005321982316672802, + "learning_rate": 9.900000000000001e-08, + "loss": 0.0003, + "num_tokens": 2854788.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 37.75, + "completions/mean_terminated_length": 37.75, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 179.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4603114426136017, + "kl": 0.0931578278541565, + "learning_rate": 9.866666666666668e-08, + "loss": 0.0047, + "num_tokens": 2855167.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 179.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10489687323570251, + "kl": 0.0028638184594456106, + "learning_rate": 9.833333333333334e-08, + "loss": 0.0002, + "num_tokens": 2855494.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 179.75925925925927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2989505529403687, + "kl": 0.13962608575820923, + "learning_rate": 9.799999999999999e-08, + "loss": 0.0296, + "num_tokens": 2855902.0, + "reward": 2.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 2.875, + "rewards/reward_combined/std": 0.25, + "step": 9707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 179.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047474656254053116, + "kl": 0.006973778363317251, + "learning_rate": 9.766666666666665e-08, + "loss": 0.0003, + "num_tokens": 2856208.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 179.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002819453366100788, + "kl": 0.0001382927075610496, + "learning_rate": 9.733333333333333e-08, + "loss": 0.0, + "num_tokens": 2856520.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 179.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022756028920412064, + "kl": 0.0007744921022094786, + "learning_rate": 9.7e-08, + "loss": 0.0, + "num_tokens": 2856840.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 179.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03172854706645012, + "kl": 0.005972646409645677, + "learning_rate": 9.666666666666667e-08, + "loss": 0.0003, + "num_tokens": 2857128.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 179.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7800384163856506, + "kl": 0.14649273082613945, + "learning_rate": 9.633333333333334e-08, + "loss": 0.0069, + "num_tokens": 2857429.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 179.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004654505755752325, + "kl": 0.000758931040763855, + "learning_rate": 9.6e-08, + "loss": 0.0, + "num_tokens": 2857645.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 179.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047067224979400635, + "kl": 0.003557793330401182, + "learning_rate": 9.566666666666668e-08, + "loss": 0.0002, + "num_tokens": 2857918.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 179.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.60748017556034e-05, + "kl": 2.1085143089294434e-06, + "learning_rate": 9.533333333333334e-08, + "loss": 0.0, + "num_tokens": 2858138.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 179.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05438502877950668, + "kl": 0.0047757600113982335, + "learning_rate": 9.500000000000002e-08, + "loss": 0.0002, + "num_tokens": 2858398.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 179.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02562764845788479, + "kl": 0.09701839089393616, + "learning_rate": 9.466666666666668e-08, + "loss": 0.0049, + "num_tokens": 2858770.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 179.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031806401908397675, + "kl": 0.019666369073092937, + "learning_rate": 9.433333333333333e-08, + "loss": 0.001, + "num_tokens": 2859143.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 179.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11035704612731934, + "kl": 0.011846620822325349, + "learning_rate": 9.4e-08, + "loss": 0.0007, + "num_tokens": 2859430.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05384654179215431, + "kl": 0.0030358732328750193, + "learning_rate": 9.366666666666666e-08, + "loss": 0.0001, + "num_tokens": 2859704.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 180.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015693986788392067, + "kl": 0.0007912683067843318, + "learning_rate": 9.333333333333334e-08, + "loss": 0.0, + "num_tokens": 2860033.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.03703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8564913272857666, + "kl": 0.13217679783701897, + "learning_rate": 9.3e-08, + "loss": 0.0179, + "num_tokens": 2860378.0, + "reward": 6.5, + "reward_std": 2.0, + "rewards/reward_combined/mean": 6.5, + "rewards/reward_combined/std": 2.0, + "step": 9722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014074375852942467, + "kl": 0.1599755361676216, + "learning_rate": 9.266666666666668e-08, + "loss": 0.008, + "num_tokens": 2860688.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 180.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.516552448272705, + "kl": 0.11829948239028454, + "learning_rate": 9.233333333333334e-08, + "loss": 0.1751, + "num_tokens": 2860980.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 35.75, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 180.09259259259258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.924603819847107, + "kl": 0.1877682562917471, + "learning_rate": 9.2e-08, + "loss": 0.0097, + "num_tokens": 2861351.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030710916966199875, + "kl": 0.0040566254465375096, + "learning_rate": 9.166666666666667e-08, + "loss": 0.0002, + "num_tokens": 2861611.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0009462040616199374, + "kl": 4.547089338302612e-05, + "learning_rate": 9.133333333333333e-08, + "loss": 0.0, + "num_tokens": 2861823.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.332174301147461, + "kl": 0.1630617007613182, + "learning_rate": 9.100000000000001e-08, + "loss": 0.1538, + "num_tokens": 2862079.0, + "reward": 2.0, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.0, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 9728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03276969864964485, + "kl": 0.009042461810167879, + "learning_rate": 9.066666666666667e-08, + "loss": 0.0005, + "num_tokens": 2862401.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 180.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017010599374771118, + "kl": 0.012413645163178444, + "learning_rate": 9.033333333333333e-08, + "loss": 0.0006, + "num_tokens": 2862715.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15554964542388916, + "kl": 0.021025316091254354, + "learning_rate": 9e-08, + "loss": 0.0013, + "num_tokens": 2862980.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 180.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01840536668896675, + "kl": 0.0016925626550801098, + "learning_rate": 8.966666666666666e-08, + "loss": 0.0001, + "num_tokens": 2863252.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017756319139152765, + "kl": 0.0035761669278144836, + "learning_rate": 8.933333333333334e-08, + "loss": 0.0002, + "num_tokens": 2863488.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 180.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026851674541831017, + "kl": 0.0009266337146982551, + "learning_rate": 8.9e-08, + "loss": 0.0001, + "num_tokens": 2863704.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.642868518829346, + "kl": 0.00667224545031786, + "learning_rate": 8.866666666666668e-08, + "loss": 0.0905, + "num_tokens": 2863984.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02941243164241314, + "kl": 0.01439197943545878, + "learning_rate": 8.833333333333333e-08, + "loss": 0.0008, + "num_tokens": 2864276.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 180.3148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.903420448303223, + "kl": 0.33060973044484854, + "learning_rate": 8.800000000000001e-08, + "loss": 0.1991, + "num_tokens": 2864582.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 7.75, + "completions/mean_terminated_length": 7.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 180.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.891350746154785, + "kl": 0.04278185838484205, + "learning_rate": 8.766666666666667e-08, + "loss": 0.1731, + "num_tokens": 2864821.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 180.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06596143543720245, + "kl": 0.028216956183314323, + "learning_rate": 8.733333333333333e-08, + "loss": 0.0014, + "num_tokens": 2865143.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004152251407504082, + "kl": 0.0014495551586151123, + "learning_rate": 8.700000000000001e-08, + "loss": 0.0001, + "num_tokens": 2865359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 180.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.743297100067139, + "kl": 0.019787953235208988, + "learning_rate": 8.666666666666666e-08, + "loss": 0.1181, + "num_tokens": 2865668.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 180.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0577881820499897, + "kl": 0.003214998869225383, + "learning_rate": 8.633333333333334e-08, + "loss": 0.0002, + "num_tokens": 2865966.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 180.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.420711517333984, + "kl": 0.024049285799264908, + "learning_rate": 8.6e-08, + "loss": 0.0858, + "num_tokens": 2866245.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 9743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 180.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03405766934156418, + "kl": 0.00945271854288876, + "learning_rate": 8.566666666666667e-08, + "loss": 0.0004, + "num_tokens": 2866531.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 180.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04872254282236099, + "kl": 0.019486029166728258, + "learning_rate": 8.533333333333334e-08, + "loss": 0.001, + "num_tokens": 2866831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 180.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00877660047262907, + "kl": 0.0005549965426325798, + "learning_rate": 8.5e-08, + "loss": 0.0, + "num_tokens": 2867113.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 89.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 89.25, + "completions/mean_terminated_length": 33.66666793823242, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 180.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.464437484741211, + "kl": 0.039972422644495964, + "learning_rate": 8.466666666666667e-08, + "loss": 0.3621, + "num_tokens": 2867694.0, + "reward": 2.799999952316284, + "reward_std": 5.770037651062012, + "rewards/reward_combined/mean": 2.799999952316284, + "rewards/reward_combined/std": 5.770037651062012, + "step": 9747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 180.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04211128503084183, + "kl": 0.010945403948426247, + "learning_rate": 8.433333333333333e-08, + "loss": 0.0005, + "num_tokens": 2867985.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 180.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04447166994214058, + "kl": 0.004647494293749332, + "learning_rate": 8.4e-08, + "loss": 0.0002, + "num_tokens": 2868275.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 180.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02404971234500408, + "kl": 0.09680220484733582, + "learning_rate": 8.366666666666667e-08, + "loss": 0.0048, + "num_tokens": 2868647.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 180.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11517985910177231, + "kl": 0.0034321162675041705, + "learning_rate": 8.333333333333333e-08, + "loss": 0.0002, + "num_tokens": 2868973.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05838199704885483, + "kl": 0.0018817521631717682, + "learning_rate": 8.3e-08, + "loss": 0.0001, + "num_tokens": 2869217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 180.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005811711773276329, + "kl": 0.00024369855236727744, + "learning_rate": 8.266666666666667e-08, + "loss": 0.0, + "num_tokens": 2869489.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 180.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01925094798207283, + "kl": 0.002049759670626372, + "learning_rate": 8.233333333333334e-08, + "loss": 0.0001, + "num_tokens": 2869759.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 180.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0649664402008057, + "kl": 0.8399522602558136, + "learning_rate": 8.2e-08, + "loss": 0.0682, + "num_tokens": 2870065.0, + "reward": 4.5, + "reward_std": 4.041451930999756, + "rewards/reward_combined/mean": 4.5, + "rewards/reward_combined/std": 4.041451930999756, + "step": 9755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.5, + "completions/mean_terminated_length": 4.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013387484475970268, + "kl": 0.0025460347533226013, + "learning_rate": 8.166666666666667e-08, + "loss": 0.0001, + "num_tokens": 2870283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028486497700214386, + "kl": 0.004458521492779255, + "learning_rate": 8.133333333333333e-08, + "loss": 0.0002, + "num_tokens": 2870613.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 180.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.02511739730835, + "kl": 0.030264260014519095, + "learning_rate": 8.1e-08, + "loss": 0.0892, + "num_tokens": 2870951.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 38.25, + "completions/mean_terminated_length": 38.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 180.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6431829929351807, + "kl": 0.38218337297439575, + "learning_rate": 8.066666666666667e-08, + "loss": 0.061, + "num_tokens": 2871320.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 9759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 180.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02280070260167122, + "kl": 0.004805322969332337, + "learning_rate": 8.033333333333334e-08, + "loss": 0.0002, + "num_tokens": 2871602.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 180.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0015851635253056884, + "kl": 0.00014714333519805223, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 2871916.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 180.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06560209393501282, + "kl": 0.010839097434654832, + "learning_rate": 7.966666666666667e-08, + "loss": 0.0005, + "num_tokens": 2872216.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 180.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011354167014360428, + "kl": 0.0001378953456878662, + "learning_rate": 7.933333333333333e-08, + "loss": 0.0, + "num_tokens": 2872472.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 180.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07772146910429001, + "kl": 0.04510589502751827, + "learning_rate": 7.9e-08, + "loss": 0.0021, + "num_tokens": 2872887.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 180.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04635665565729141, + "kl": 0.03606859967112541, + "learning_rate": 7.866666666666667e-08, + "loss": 0.0018, + "num_tokens": 2873187.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 180.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.049996376037598, + "kl": 0.7344900369644165, + "learning_rate": 7.833333333333335e-08, + "loss": -0.0052, + "num_tokens": 2873529.0, + "reward": 4.125, + "reward_std": 4.308422088623047, + "rewards/reward_combined/mean": 4.125, + "rewards/reward_combined/std": 4.308422088623047, + "step": 9766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 180.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048285603523254395, + "kl": 0.006073974538594484, + "learning_rate": 7.8e-08, + "loss": 0.0003, + "num_tokens": 2873797.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 180.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042340707033872604, + "kl": 0.0013536736369132996, + "learning_rate": 7.766666666666666e-08, + "loss": 0.0001, + "num_tokens": 2874057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 180.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.848901987017598e-06, + "kl": 1.996755599975586e-06, + "learning_rate": 7.733333333333334e-08, + "loss": 0.0, + "num_tokens": 2874277.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 180.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03038335032761097, + "kl": 0.007244020933285356, + "learning_rate": 7.7e-08, + "loss": 0.0003, + "num_tokens": 2874630.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 180.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020245898514986038, + "kl": 0.00036829710006713867, + "learning_rate": 7.666666666666668e-08, + "loss": 0.0, + "num_tokens": 2874834.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 180.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01662726141512394, + "kl": 0.0015327015426009893, + "learning_rate": 7.633333333333333e-08, + "loss": 0.0001, + "num_tokens": 2875116.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 180.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1702125370502472, + "kl": 0.01502043369691819, + "learning_rate": 7.6e-08, + "loss": 0.0007, + "num_tokens": 2875384.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.25, + "completions/mean_terminated_length": 9.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 181.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9899954795837402, + "kl": 0.3369261724874377, + "learning_rate": 7.566666666666667e-08, + "loss": 0.0177, + "num_tokens": 2875645.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 181.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.43036413192749, + "kl": 0.10908255726099014, + "learning_rate": 7.533333333333333e-08, + "loss": 0.0582, + "num_tokens": 2875891.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 181.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03449322283267975, + "kl": 0.0026401603827252984, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0001, + "num_tokens": 2876192.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019202911062166095, + "kl": 0.0035481080412864685, + "learning_rate": 7.466666666666666e-08, + "loss": 0.0002, + "num_tokens": 2876428.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.07407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.282996654510498, + "kl": 0.028393109212629497, + "learning_rate": 7.433333333333334e-08, + "loss": 0.1089, + "num_tokens": 2876721.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 9778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08331066370010376, + "kl": 0.01894476218149066, + "learning_rate": 7.4e-08, + "loss": 0.0011, + "num_tokens": 2877005.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 181.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7062828540802, + "kl": 0.04134655185043812, + "learning_rate": 7.366666666666666e-08, + "loss": 0.0718, + "num_tokens": 2877329.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 31.0, + "completions/mean_terminated_length": 31.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 181.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049431804567575455, + "kl": 0.032853782176971436, + "learning_rate": 7.333333333333334e-08, + "loss": 0.0016, + "num_tokens": 2877677.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 181.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035436488687992096, + "kl": 0.004036555823404342, + "learning_rate": 7.299999999999999e-08, + "loss": 0.0002, + "num_tokens": 2877935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 181.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2851065397262573, + "kl": 0.024580217897892, + "learning_rate": 7.266666666666667e-08, + "loss": 0.0015, + "num_tokens": 2878197.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027735114097595215, + "kl": 0.0002457946538925171, + "learning_rate": 7.233333333333333e-08, + "loss": 0.0, + "num_tokens": 2878409.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.5, + "completions/mean_terminated_length": 10.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 181.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029226010665297508, + "kl": 0.0027967566275037825, + "learning_rate": 7.200000000000001e-08, + "loss": 0.0001, + "num_tokens": 2878675.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 181.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005747013725340366, + "kl": 0.16158832609653473, + "learning_rate": 7.166666666666667e-08, + "loss": 0.0081, + "num_tokens": 2878984.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 181.24074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3371148109436035, + "kl": 0.3379513509571552, + "learning_rate": 7.133333333333334e-08, + "loss": 0.0063, + "num_tokens": 2879324.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.05634636990726e-05, + "kl": 1.7583370208740234e-06, + "learning_rate": 7.1e-08, + "loss": 0.0, + "num_tokens": 2879544.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011363636702299118, + "clip_ratio/low_min": 0.011363636702299118, + "clip_ratio/region_mean": 0.011363636702299118, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 181.27777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3449803590774536, + "kl": 0.3724980056285858, + "learning_rate": 7.066666666666666e-08, + "loss": 0.1351, + "num_tokens": 2879860.0, + "reward": 5.75, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 2.598076105117798, + "step": 9789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1281738132238388, + "kl": 0.006687208544462919, + "learning_rate": 7.033333333333334e-08, + "loss": 0.0004, + "num_tokens": 2880087.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 181.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0003145444788970053, + "kl": 0.0012390486081130803, + "learning_rate": 7e-08, + "loss": 0.0001, + "num_tokens": 2880367.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 181.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9833400249481201, + "kl": 0.25872868299484253, + "learning_rate": 6.966666666666667e-08, + "loss": 0.0064, + "num_tokens": 2880733.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 181.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0057641384191811085, + "kl": 7.436275336658582e-05, + "learning_rate": 6.933333333333333e-08, + "loss": 0.0, + "num_tokens": 2880989.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 181.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007799781858921051, + "kl": 0.0011353492736816406, + "learning_rate": 6.9e-08, + "loss": 0.0001, + "num_tokens": 2881249.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052825137972831726, + "kl": 0.0035371724516153336, + "learning_rate": 6.866666666666667e-08, + "loss": 0.0002, + "num_tokens": 2881549.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 181.40740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.983455657958984, + "kl": 0.12283117696642876, + "learning_rate": 6.833333333333334e-08, + "loss": -0.0039, + "num_tokens": 2881873.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 9796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 181.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013385354541242123, + "kl": 0.0004972443566657603, + "learning_rate": 6.8e-08, + "loss": 0.0, + "num_tokens": 2882191.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 181.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04007229208946228, + "kl": 0.09807289391756058, + "learning_rate": 6.766666666666666e-08, + "loss": 0.0049, + "num_tokens": 2882563.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 181.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04402105510234833, + "kl": 0.009249531663954258, + "learning_rate": 6.733333333333333e-08, + "loss": 0.0005, + "num_tokens": 2882852.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 181.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05748549476265907, + "kl": 0.00761270709335804, + "learning_rate": 6.7e-08, + "loss": 0.0004, + "num_tokens": 2883142.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 181.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044741880148649216, + "kl": 0.007926232647150755, + "learning_rate": 6.666666666666667e-08, + "loss": 0.0004, + "num_tokens": 2883488.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 181.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025197431445121765, + "kl": 0.005192175507545471, + "learning_rate": 6.633333333333334e-08, + "loss": 0.0003, + "num_tokens": 2883756.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 181.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01790686324238777, + "kl": 0.0023043788969516754, + "learning_rate": 6.6e-08, + "loss": 0.0001, + "num_tokens": 2884068.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 181.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003130113473162055, + "kl": 0.0002112908405251801, + "learning_rate": 6.566666666666667e-08, + "loss": 0.0, + "num_tokens": 2884377.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 181.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9966764450073242, + "kl": 0.10453439690172672, + "learning_rate": 6.533333333333334e-08, + "loss": -0.0288, + "num_tokens": 2884779.0, + "reward": 2.375, + "reward_std": 1.25, + "rewards/reward_combined/mean": 2.375, + "rewards/reward_combined/std": 1.25, + "step": 9805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 181.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026720933616161346, + "kl": 0.0009430637292098254, + "learning_rate": 6.5e-08, + "loss": 0.0, + "num_tokens": 2885049.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017182189971208572, + "kl": 0.0002554208040237427, + "learning_rate": 6.466666666666668e-08, + "loss": 0.0, + "num_tokens": 2885261.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.62962962962962, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1586709022521973, + "kl": 0.1482773907482624, + "learning_rate": 6.433333333333333e-08, + "loss": -0.0462, + "num_tokens": 2885534.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 181.64814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.106722593307495, + "kl": 0.03738443832844496, + "learning_rate": 6.4e-08, + "loss": 0.0009, + "num_tokens": 2885834.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 181.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21426236629486084, + "kl": 0.09430229105055332, + "learning_rate": 6.366666666666667e-08, + "loss": 0.0047, + "num_tokens": 2886172.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 181.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051380306482315063, + "kl": 0.011238531209528446, + "learning_rate": 6.333333333333333e-08, + "loss": 0.0006, + "num_tokens": 2886500.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.5, + "completions/mean_terminated_length": 6.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 181.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027584191411733627, + "kl": 0.0011591293732635677, + "learning_rate": 6.300000000000001e-08, + "loss": 0.0001, + "num_tokens": 2886734.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049617331475019455, + "kl": 0.013029744965024292, + "learning_rate": 6.266666666666666e-08, + "loss": 0.0007, + "num_tokens": 2887020.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 3.0, + "completions/mean_terminated_length": 3.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 181.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06499826908111572, + "kl": 0.0023485174169763923, + "learning_rate": 6.233333333333334e-08, + "loss": 0.0001, + "num_tokens": 2887236.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 181.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09056086838245392, + "kl": 0.005128631251864135, + "learning_rate": 6.2e-08, + "loss": 0.0003, + "num_tokens": 2887508.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 181.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12840566039085388, + "kl": 0.01657271245494485, + "learning_rate": 6.166666666666668e-08, + "loss": 0.0008, + "num_tokens": 2887799.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 181.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16025394201278687, + "kl": 0.02720168326050043, + "learning_rate": 6.133333333333334e-08, + "loss": 0.0015, + "num_tokens": 2888095.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 181.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19129562377929688, + "kl": 0.03511792724020779, + "learning_rate": 6.099999999999999e-08, + "loss": 0.0021, + "num_tokens": 2888438.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 181.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05932667851448059, + "kl": 0.012376388534903526, + "learning_rate": 6.066666666666667e-08, + "loss": 0.0006, + "num_tokens": 2888767.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 181.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0012764434795826674, + "kl": 0.00033229589462280273, + "learning_rate": 6.033333333333333e-08, + "loss": 0.0, + "num_tokens": 2889027.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 35.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 181.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.832655429840088, + "kl": 0.14973975904285908, + "learning_rate": 6.000000000000001e-08, + "loss": -0.0326, + "num_tokens": 2889395.0, + "reward": 7.875, + "reward_std": 0.25, + "rewards/reward_combined/mean": 7.875, + "rewards/reward_combined/std": 0.25, + "step": 9821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 181.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03700793534517288, + "kl": 0.006786672165617347, + "learning_rate": 5.966666666666667e-08, + "loss": 0.0003, + "num_tokens": 2889673.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.0, + "completions/max_terminated_length": 44.0, + "completions/mean_length": 25.25, + "completions/mean_terminated_length": 25.25, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 181.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8758647441864014, + "kl": 0.03196819685399532, + "learning_rate": 5.933333333333333e-08, + "loss": 0.2018, + "num_tokens": 2889998.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 181.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07538042217493057, + "kl": 0.0030323906103149056, + "learning_rate": 5.9e-08, + "loss": 0.0002, + "num_tokens": 2890269.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 181.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022711871191859245, + "kl": 0.0019327686168253422, + "learning_rate": 5.866666666666667e-08, + "loss": 0.0001, + "num_tokens": 2890565.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 181.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023565754294395447, + "kl": 0.0006713151960866526, + "learning_rate": 5.833333333333333e-08, + "loss": 0.0, + "num_tokens": 2890784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 181.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07558811455965042, + "kl": 0.0033678172621876, + "learning_rate": 5.8e-08, + "loss": 0.0001, + "num_tokens": 2891054.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 182.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01791425794363022, + "kl": 0.012602954637259245, + "learning_rate": 5.7666666666666673e-08, + "loss": 0.0006, + "num_tokens": 2891314.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 182.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027833988890051842, + "kl": 0.000940871424973011, + "learning_rate": 5.733333333333333e-08, + "loss": 0.0, + "num_tokens": 2891549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 182.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04389351233839989, + "kl": 0.009435498155653477, + "learning_rate": 5.7e-08, + "loss": 0.0005, + "num_tokens": 2891838.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 4.75, + "completions/mean_terminated_length": 4.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050671618431806564, + "kl": 0.0014690712559968233, + "learning_rate": 5.666666666666667e-08, + "loss": 0.0001, + "num_tokens": 2892057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 182.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09843046963214874, + "kl": 0.02210741490125656, + "learning_rate": 5.6333333333333335e-08, + "loss": 0.0011, + "num_tokens": 2892358.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046263426542282104, + "kl": 0.0021884179150220007, + "learning_rate": 5.6000000000000005e-08, + "loss": 0.0001, + "num_tokens": 2892656.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 182.11111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019380565732717514, + "kl": 0.0006750524044036865, + "learning_rate": 5.566666666666666e-08, + "loss": 0.0, + "num_tokens": 2892868.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 182.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03707139194011688, + "kl": 0.0010180601675529033, + "learning_rate": 5.533333333333333e-08, + "loss": 0.0001, + "num_tokens": 2893140.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 182.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0200099665671587, + "kl": 0.0006193131339387037, + "learning_rate": 5.5e-08, + "loss": 0.0, + "num_tokens": 2893396.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 182.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7514722943305969, + "kl": 0.09609995130449533, + "learning_rate": 5.466666666666667e-08, + "loss": 0.005, + "num_tokens": 2893740.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 26.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 182.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2288810759782791, + "kl": 0.056039443239569664, + "learning_rate": 5.433333333333334e-08, + "loss": 0.003, + "num_tokens": 2894098.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 43.0, + "completions/mean_terminated_length": 43.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 182.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13364006578922272, + "kl": 0.01917952485382557, + "learning_rate": 5.4e-08, + "loss": 0.0009, + "num_tokens": 2894490.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 182.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017981266602873802, + "kl": 0.04103469289839268, + "learning_rate": 5.3666666666666664e-08, + "loss": 0.002, + "num_tokens": 2894895.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 182.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005609320942312479, + "kl": 0.0006834566593170166, + "learning_rate": 5.3333333333333334e-08, + "loss": 0.0, + "num_tokens": 2895179.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 182.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017934110015630722, + "kl": 0.0007517827034462243, + "learning_rate": 5.3000000000000005e-08, + "loss": 0.0, + "num_tokens": 2895501.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2706718444824219, + "kl": 0.02328595519065857, + "learning_rate": 5.2666666666666675e-08, + "loss": 0.0012, + "num_tokens": 2895717.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16941288113594055, + "kl": 0.012295078253373504, + "learning_rate": 5.233333333333333e-08, + "loss": 0.0006, + "num_tokens": 2895980.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 21.25, + "completions/mean_terminated_length": 21.25, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 182.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06400128453969955, + "kl": 0.0343914981931448, + "learning_rate": 5.1999999999999996e-08, + "loss": 0.0017, + "num_tokens": 2896293.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 182.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07269112765789032, + "kl": 0.0035394877195358276, + "learning_rate": 5.1666666666666666e-08, + "loss": 0.0002, + "num_tokens": 2896553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 182.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020313585177063942, + "kl": 0.0014387592382263392, + "learning_rate": 5.1333333333333336e-08, + "loss": 0.0001, + "num_tokens": 2896831.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 13.0, + "completions/mean_terminated_length": 13.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 182.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010828782804310322, + "kl": 0.007247310597449541, + "learning_rate": 5.100000000000001e-08, + "loss": 0.0004, + "num_tokens": 2897103.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 9.75, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 182.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.221320867538452, + "kl": 0.32922819582745433, + "learning_rate": 5.0666666666666664e-08, + "loss": 0.0187, + "num_tokens": 2897366.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 182.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028008262161165476, + "kl": 0.00014487239241134375, + "learning_rate": 5.0333333333333334e-08, + "loss": 0.0, + "num_tokens": 2897678.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 182.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9942631721496582, + "kl": 0.2854239344596863, + "learning_rate": 5e-08, + "loss": 0.0144, + "num_tokens": 2897988.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 182.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08285778760910034, + "kl": 0.024885154329240322, + "learning_rate": 4.966666666666667e-08, + "loss": 0.0013, + "num_tokens": 2898276.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 182.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12954916059970856, + "kl": 0.07800208777189255, + "learning_rate": 4.933333333333334e-08, + "loss": 0.0039, + "num_tokens": 2898662.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038082655519247055, + "kl": 0.0021064550091978163, + "learning_rate": 4.8999999999999995e-08, + "loss": 0.0001, + "num_tokens": 2898916.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 182.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2266039848327637, + "kl": 0.005796613288111985, + "learning_rate": 4.8666666666666666e-08, + "loss": -0.0142, + "num_tokens": 2899208.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02271237038075924, + "kl": 0.0020514721982181072, + "learning_rate": 4.8333333333333336e-08, + "loss": 0.0001, + "num_tokens": 2899485.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 46.25, + "completions/mean_terminated_length": 46.25, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 182.53703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9496564865112305, + "kl": 0.07561144791543484, + "learning_rate": 4.8e-08, + "loss": 0.1046, + "num_tokens": 2899894.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07119078934192657, + "kl": 0.009405347518622875, + "learning_rate": 4.766666666666667e-08, + "loss": 0.0005, + "num_tokens": 2900163.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9858 + }, + { + "clip_ratio/high_max": 0.00909090880304575, + "clip_ratio/high_mean": 0.00909090880304575, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00909090880304575, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 182.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.135387420654297, + "kl": 0.06301749683916569, + "learning_rate": 4.733333333333334e-08, + "loss": -0.0035, + "num_tokens": 2900487.0, + "reward": 2.5, + "reward_std": 1.7320507764816284, + "rewards/reward_combined/mean": 2.5, + "rewards/reward_combined/std": 1.7320507764816284, + "step": 9859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 182.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04674152284860611, + "kl": 0.009944313438609242, + "learning_rate": 4.7e-08, + "loss": 0.0005, + "num_tokens": 2900785.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 182.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.004546642303467, + "kl": 0.05565602611750364, + "learning_rate": 4.666666666666667e-08, + "loss": 0.251, + "num_tokens": 2901072.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 182.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04640190303325653, + "kl": 0.0042971475049853325, + "learning_rate": 4.633333333333334e-08, + "loss": 0.0002, + "num_tokens": 2901372.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 182.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00401738565415144, + "kl": 0.00024271011352539062, + "learning_rate": 4.6e-08, + "loss": 0.0, + "num_tokens": 2901616.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 182.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005986438598483801, + "kl": 0.003446533199166879, + "learning_rate": 4.5666666666666665e-08, + "loss": 0.0002, + "num_tokens": 2901874.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 182.6851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054798975586891174, + "kl": 0.013443166855722666, + "learning_rate": 4.5333333333333336e-08, + "loss": 0.0007, + "num_tokens": 2902201.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 182.7037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.616187572479248, + "kl": 0.1407563267275691, + "learning_rate": 4.5e-08, + "loss": -0.1066, + "num_tokens": 2902499.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07756587862968445, + "kl": 0.001069672405719757, + "learning_rate": 4.466666666666667e-08, + "loss": 0.0001, + "num_tokens": 2902711.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.326424598693848, + "kl": 0.01726543391123414, + "learning_rate": 4.433333333333334e-08, + "loss": 0.0007, + "num_tokens": 2902983.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 182.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11409106105566025, + "kl": 0.01539933169260621, + "learning_rate": 4.4000000000000004e-08, + "loss": 0.0008, + "num_tokens": 2903307.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.8136760849738494e-05, + "kl": 1.691281795501709e-06, + "learning_rate": 4.366666666666667e-08, + "loss": 0.0, + "num_tokens": 2903527.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 36.25, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 182.7962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0188217163085938, + "kl": 0.0944785475730896, + "learning_rate": 4.333333333333333e-08, + "loss": -0.0308, + "num_tokens": 2903888.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 25.5, + "completions/mean_terminated_length": 25.5, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 182.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01453560683876276, + "kl": 0.07380939088761806, + "learning_rate": 4.3e-08, + "loss": 0.0037, + "num_tokens": 2904258.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 182.83333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7986859083175659, + "kl": 0.5325427949428558, + "learning_rate": 4.266666666666667e-08, + "loss": 0.04, + "num_tokens": 2904563.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 182.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05519038438796997, + "kl": 0.00658091323566623, + "learning_rate": 4.2333333333333335e-08, + "loss": 0.0002, + "num_tokens": 2904879.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 27.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 182.87037037037038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9498788714408875, + "kl": 0.05868354067206383, + "learning_rate": 4.2e-08, + "loss": 0.0032, + "num_tokens": 2905223.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 182.88888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.024899005889893, + "kl": 0.04051691293716431, + "learning_rate": 4.166666666666666e-08, + "loss": 0.0039, + "num_tokens": 2905525.0, + "reward": 5.25, + "reward_std": 2.598076105117798, + "rewards/reward_combined/mean": 5.25, + "rewards/reward_combined/std": 2.598076105117798, + "step": 9876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 182.90740740740742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4432549476623535, + "kl": 0.016404787078499794, + "learning_rate": 4.133333333333333e-08, + "loss": 0.0299, + "num_tokens": 2905839.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 182.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026123646646738052, + "kl": 0.012449371162801981, + "learning_rate": 4.1e-08, + "loss": 0.0007, + "num_tokens": 2906111.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 182.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001900079776532948, + "kl": 0.003551207482814789, + "learning_rate": 4.066666666666667e-08, + "loss": 0.0002, + "num_tokens": 2906347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 182.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014301084913313389, + "kl": 0.0002517402172088623, + "learning_rate": 4.033333333333334e-08, + "loss": 0.0, + "num_tokens": 2906551.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 182.9814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8679953813552856, + "kl": 0.01237863814458251, + "learning_rate": 4e-08, + "loss": -0.0753, + "num_tokens": 2906908.0, + "reward": 5.125, + "reward_std": 3.4731109142303467, + "rewards/reward_combined/mean": 5.125, + "rewards/reward_combined/std": 3.4731109142303467, + "step": 9881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 183.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11042208224534988, + "kl": 0.012875689659267664, + "learning_rate": 3.9666666666666665e-08, + "loss": 0.0007, + "num_tokens": 2907198.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 31.25, + "completions/mean_terminated_length": 31.25, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 183.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059440977871418, + "kl": 0.03435606695711613, + "learning_rate": 3.9333333333333335e-08, + "loss": 0.0017, + "num_tokens": 2907603.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 183.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00046914175618439913, + "kl": 1.689046621322632e-05, + "learning_rate": 3.9e-08, + "loss": 0.0, + "num_tokens": 2907815.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 183.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04074396565556526, + "kl": 0.002172502805478871, + "learning_rate": 3.866666666666667e-08, + "loss": 0.0001, + "num_tokens": 2908075.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 183.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026256661862134933, + "kl": 0.0010736336407717317, + "learning_rate": 3.833333333333334e-08, + "loss": 0.0001, + "num_tokens": 2908307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 183.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038920480757951736, + "kl": 0.002573552686953917, + "learning_rate": 3.8e-08, + "loss": 0.0001, + "num_tokens": 2908567.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 29.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 183.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.539245128631592, + "kl": 0.0691833607852459, + "learning_rate": 3.7666666666666666e-08, + "loss": 0.1144, + "num_tokens": 2908920.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 183.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044224534183740616, + "kl": 0.01291979430243373, + "learning_rate": 3.733333333333333e-08, + "loss": 0.0007, + "num_tokens": 2909194.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 183.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609105110168457, + "kl": 0.07271228171885014, + "learning_rate": 3.7e-08, + "loss": 0.0065, + "num_tokens": 2909498.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 183.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3273508548736572, + "kl": 0.01727142045274377, + "learning_rate": 3.666666666666667e-08, + "loss": 0.1236, + "num_tokens": 2909841.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 183.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03298269957304001, + "kl": 0.008421921404078603, + "learning_rate": 3.6333333333333334e-08, + "loss": 0.0004, + "num_tokens": 2910169.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 183.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.074802465736866, + "kl": 0.027217524126172066, + "learning_rate": 3.6000000000000005e-08, + "loss": 0.0013, + "num_tokens": 2910515.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 3.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.5, + "completions/mean_terminated_length": 3.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 183.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03195382282137871, + "kl": 0.000666402280330658, + "learning_rate": 3.566666666666667e-08, + "loss": 0.0, + "num_tokens": 2910725.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 183.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08096520602703094, + "kl": 0.020325182005763054, + "learning_rate": 3.533333333333333e-08, + "loss": 0.001, + "num_tokens": 2910997.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 183.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029553748667240143, + "kl": 0.004050072107929736, + "learning_rate": 3.5e-08, + "loss": 0.0002, + "num_tokens": 2911265.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 183.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.620262622833252, + "kl": 0.5815833956003189, + "learning_rate": 3.4666666666666666e-08, + "loss": 0.0426, + "num_tokens": 2911508.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 183.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.405125299759675e-06, + "kl": 1.862645149230957e-06, + "learning_rate": 3.4333333333333336e-08, + "loss": 0.0, + "num_tokens": 2911728.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 183.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07107184827327728, + "kl": 0.006637036451138556, + "learning_rate": 3.4e-08, + "loss": 0.0003, + "num_tokens": 2912000.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 34.5, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 183.33333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9404093027114868, + "kl": 0.2910590171813965, + "learning_rate": 3.3666666666666664e-08, + "loss": -0.0842, + "num_tokens": 2912366.0, + "reward": 7.75, + "reward_std": 0.28867512941360474, + "rewards/reward_combined/mean": 7.75, + "rewards/reward_combined/std": 0.28867512941360474, + "step": 9900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 183.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4035552740097046, + "kl": 0.1563080116175115, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.0085, + "num_tokens": 2912664.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 183.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016459928825497627, + "kl": 0.0008407303830608726, + "learning_rate": 3.3e-08, + "loss": 0.0, + "num_tokens": 2912946.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 183.38888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028843404725193977, + "kl": 0.00676688551902771, + "learning_rate": 3.266666666666667e-08, + "loss": 0.0003, + "num_tokens": 2913218.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 183.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01898745447397232, + "kl": 0.0006910534575581551, + "learning_rate": 3.233333333333334e-08, + "loss": 0.0, + "num_tokens": 2913530.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 33.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 183.42592592592592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4804060459136963, + "kl": 0.1441753190010786, + "learning_rate": 3.2e-08, + "loss": -0.014, + "num_tokens": 2913887.0, + "reward": 6.375, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.375, + "rewards/reward_combined/std": 2.25, + "step": 9905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 183.44444444444446, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.594256401062012, + "kl": 0.042867622105404735, + "learning_rate": 3.1666666666666666e-08, + "loss": 0.0058, + "num_tokens": 2914169.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 183.46296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.631552696228027, + "kl": 0.03864024020731449, + "learning_rate": 3.133333333333333e-08, + "loss": -0.0338, + "num_tokens": 2914516.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 183.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07126500457525253, + "kl": 0.01517056580632925, + "learning_rate": 3.1e-08, + "loss": 0.0007, + "num_tokens": 2914819.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 183.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07337877154350281, + "kl": 0.016658049076795578, + "learning_rate": 3.066666666666667e-08, + "loss": 0.001, + "num_tokens": 2915101.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 183.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11530859768390656, + "kl": 0.032312868162989616, + "learning_rate": 3.0333333333333334e-08, + "loss": 0.0016, + "num_tokens": 2915415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 8.25, + "completions/mean_terminated_length": 8.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 183.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6236201524734497, + "kl": 0.07196260988712311, + "learning_rate": 3.0000000000000004e-08, + "loss": 0.0035, + "num_tokens": 2915660.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 183.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07097512483596802, + "kl": 0.002381057245656848, + "learning_rate": 2.9666666666666664e-08, + "loss": 0.0001, + "num_tokens": 2915974.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 183.57407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0489969365298748, + "kl": 0.005790283539681695, + "learning_rate": 2.9333333333333335e-08, + "loss": 0.0003, + "num_tokens": 2916290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 183.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.644816517829895, + "kl": 0.06182416994124651, + "learning_rate": 2.9e-08, + "loss": 0.0032, + "num_tokens": 2916564.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 183.61111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034353183582425117, + "kl": 0.0003933049738407135, + "learning_rate": 2.8666666666666665e-08, + "loss": 0.0, + "num_tokens": 2916824.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 183.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3449805974960327, + "kl": 0.4834003150463104, + "learning_rate": 2.8333333333333336e-08, + "loss": 0.0238, + "num_tokens": 2917127.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 183.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021869849413633347, + "kl": 0.0009674280881881714, + "learning_rate": 2.8000000000000003e-08, + "loss": 0.0, + "num_tokens": 2917339.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 183.66666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005327875725924969, + "kl": 0.0014873594045639038, + "learning_rate": 2.7666666666666666e-08, + "loss": 0.0001, + "num_tokens": 2917555.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 183.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2582266330718994, + "kl": 0.003645677206804976, + "learning_rate": 2.7333333333333337e-08, + "loss": 0.0022, + "num_tokens": 2917886.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 183.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0511690117418766, + "kl": 0.0010033586295321584, + "learning_rate": 2.7e-08, + "loss": 0.0001, + "num_tokens": 2918143.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 183.72222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03563246876001358, + "kl": 0.0030860661063343287, + "learning_rate": 2.6666666666666667e-08, + "loss": 0.0002, + "num_tokens": 2918431.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 183.74074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005469589959830046, + "kl": 0.00027230083651375026, + "learning_rate": 2.6333333333333338e-08, + "loss": 0.0, + "num_tokens": 2918691.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 183.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04079728201031685, + "kl": 0.008279956877231598, + "learning_rate": 2.5999999999999998e-08, + "loss": 0.0004, + "num_tokens": 2918981.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 11.25, + "completions/mean_terminated_length": 11.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 183.77777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9211490154266357, + "kl": 0.23908530501648784, + "learning_rate": 2.5666666666666668e-08, + "loss": -0.0115, + "num_tokens": 2919250.0, + "reward": 4.75, + "reward_std": 3.4034295082092285, + "rewards/reward_combined/mean": 4.75, + "rewards/reward_combined/std": 3.4034297466278076, + "step": 9924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.25, + "completions/mean_terminated_length": 13.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 183.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000467156118247658, + "kl": 0.0013129416620358825, + "learning_rate": 2.5333333333333332e-08, + "loss": 0.0001, + "num_tokens": 2919527.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 183.8148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01972462236881256, + "kl": 0.0071532020810991526, + "learning_rate": 2.5e-08, + "loss": 0.0004, + "num_tokens": 2919823.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 183.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018887687474489212, + "kl": 0.0007814254495315254, + "learning_rate": 2.466666666666667e-08, + "loss": 0.0, + "num_tokens": 2920103.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 26.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 183.85185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02301059663295746, + "kl": 0.09546614065766335, + "learning_rate": 2.4333333333333333e-08, + "loss": 0.0048, + "num_tokens": 2920476.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 22.5, + "completions/mean_terminated_length": 22.5, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 183.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04622969776391983, + "kl": 0.006424385355785489, + "learning_rate": 2.4e-08, + "loss": 0.0003, + "num_tokens": 2920810.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 183.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12056931853294373, + "kl": 0.06816163286566734, + "learning_rate": 2.366666666666667e-08, + "loss": 0.0035, + "num_tokens": 2921184.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 183.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07800615578889847, + "kl": 0.03056582622230053, + "learning_rate": 2.3333333333333334e-08, + "loss": 0.0015, + "num_tokens": 2921486.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 183.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035695284605026245, + "kl": 0.0036770704900845885, + "learning_rate": 2.3e-08, + "loss": 0.0002, + "num_tokens": 2921820.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 5.75, + "completions/mean_terminated_length": 5.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 183.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21672280132770538, + "kl": 0.016340750502422452, + "learning_rate": 2.2666666666666668e-08, + "loss": 0.0011, + "num_tokens": 2922043.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 183.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037876784801483154, + "kl": 0.00466212525498122, + "learning_rate": 2.2333333333333335e-08, + "loss": 0.0002, + "num_tokens": 2922334.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 183.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008505430072546005, + "kl": 0.1598047837615013, + "learning_rate": 2.2000000000000002e-08, + "loss": 0.008, + "num_tokens": 2922644.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0425378791987896, + "kl": 0.00784134236164391, + "learning_rate": 2.1666666666666665e-08, + "loss": 0.0004, + "num_tokens": 2922931.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 184.0185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028524266090244055, + "kl": 0.00020351459534140304, + "learning_rate": 2.1333333333333336e-08, + "loss": 0.0, + "num_tokens": 2923195.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.0, + "completions/max_terminated_length": 6.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 184.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026593690738081932, + "kl": 0.0007211466581793502, + "learning_rate": 2.1e-08, + "loss": 0.0, + "num_tokens": 2923415.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 184.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.173880243906751e-05, + "kl": 2.6598572731018066e-06, + "learning_rate": 2.0666666666666666e-08, + "loss": 0.0, + "num_tokens": 2923635.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04364466667175293, + "kl": 0.009181763249216601, + "learning_rate": 2.0333333333333333e-08, + "loss": 0.0005, + "num_tokens": 2923922.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 184.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0627545490860939, + "kl": 0.02671348676085472, + "learning_rate": 2e-08, + "loss": 0.0013, + "num_tokens": 2924268.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 184.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.020615577697754, + "kl": 0.030892505426891148, + "learning_rate": 1.9666666666666667e-08, + "loss": 0.2241, + "num_tokens": 2924514.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034249190241098404, + "kl": 0.0062084178207442164, + "learning_rate": 1.9333333333333334e-08, + "loss": 0.0003, + "num_tokens": 2924818.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.14814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.559693336486816, + "kl": 0.06395634077489376, + "learning_rate": 1.9e-08, + "loss": 0.0509, + "num_tokens": 2925093.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 184.16666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0060663470067083836, + "kl": 0.00030325717671075836, + "learning_rate": 1.8666666666666665e-08, + "loss": 0.0, + "num_tokens": 2925404.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 184.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20037707686424255, + "kl": 0.01274197647580877, + "learning_rate": 1.8333333333333335e-08, + "loss": 0.0007, + "num_tokens": 2925730.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 184.2037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004123511258512735, + "kl": 0.00024154037237167358, + "learning_rate": 1.8000000000000002e-08, + "loss": 0.0, + "num_tokens": 2925974.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 184.22222222222223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03306548669934273, + "kl": 0.009320731740444899, + "learning_rate": 1.7666666666666666e-08, + "loss": 0.0005, + "num_tokens": 2926297.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 184.24074074074073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04611361399292946, + "kl": 0.002875296981073916, + "learning_rate": 1.7333333333333333e-08, + "loss": 0.0001, + "num_tokens": 2926569.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.25925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039583124220371246, + "kl": 0.006974290125072002, + "learning_rate": 1.7e-08, + "loss": 0.0003, + "num_tokens": 2926860.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 26.0, + "completions/mean_terminated_length": 26.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 184.27777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023145070299506187, + "kl": 0.09695570915937424, + "learning_rate": 1.6666666666666667e-08, + "loss": 0.0048, + "num_tokens": 2927232.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 10.0, + "completions/mean_terminated_length": 10.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 184.2962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04906821250915527, + "kl": 0.0006284117553150281, + "learning_rate": 1.6333333333333334e-08, + "loss": 0.0, + "num_tokens": 2927488.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 184.3148148148148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09616069495677948, + "kl": 0.01611461688298732, + "learning_rate": 1.6e-08, + "loss": 0.001, + "num_tokens": 2927807.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 26.75, + "completions/mean_terminated_length": 26.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.33333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1833784133195877, + "kl": 0.05304853431880474, + "learning_rate": 1.5666666666666665e-08, + "loss": 0.0024, + "num_tokens": 2928194.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5.0, + "completions/max_terminated_length": 5.0, + "completions/mean_length": 5.0, + "completions/mean_terminated_length": 5.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 184.35185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033897364046424627, + "kl": 0.00013303756713867188, + "learning_rate": 1.5333333333333335e-08, + "loss": 0.0, + "num_tokens": 2928414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 184.37037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1685088872909546, + "kl": 0.17326711118221283, + "learning_rate": 1.5000000000000002e-08, + "loss": 0.0087, + "num_tokens": 2928725.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 184.38888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5447914600372314, + "kl": 0.24420192709658295, + "learning_rate": 1.4666666666666667e-08, + "loss": -0.0272, + "num_tokens": 2929074.0, + "reward": 5.0, + "reward_std": 3.5590262413024902, + "rewards/reward_combined/mean": 5.0, + "rewards/reward_combined/std": 3.5590262413024902, + "step": 9957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 184.40740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04683195427060127, + "kl": 0.0038846245734021068, + "learning_rate": 1.4333333333333333e-08, + "loss": 0.0002, + "num_tokens": 2929346.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.42592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12173474580049515, + "kl": 0.02009764825925231, + "learning_rate": 1.4000000000000001e-08, + "loss": 0.0012, + "num_tokens": 2929628.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 184.44444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03806106001138687, + "kl": 0.0033851079642772675, + "learning_rate": 1.3666666666666668e-08, + "loss": 0.0002, + "num_tokens": 2929940.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.46296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02742687612771988, + "kl": 0.005892345157917589, + "learning_rate": 1.3333333333333334e-08, + "loss": 0.0003, + "num_tokens": 2930228.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 184.4814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0044488986022770405, + "kl": 2.4691224098205566e-05, + "learning_rate": 1.2999999999999999e-08, + "loss": 0.0, + "num_tokens": 2930440.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 32.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 184.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013761185109615326, + "kl": 0.002048698952421546, + "learning_rate": 1.2666666666666666e-08, + "loss": 0.0001, + "num_tokens": 2930790.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 9.0, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 184.5185185185185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018396787345409393, + "kl": 0.012444132007658482, + "learning_rate": 1.2333333333333335e-08, + "loss": 0.0006, + "num_tokens": 2931050.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 184.53703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01640355959534645, + "kl": 0.2655911296606064, + "learning_rate": 1.2e-08, + "loss": 0.0133, + "num_tokens": 2931354.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 184.55555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0358988493680954, + "kl": 0.0027944179018959403, + "learning_rate": 1.1666666666666667e-08, + "loss": 0.0001, + "num_tokens": 2931655.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 6.75, + "completions/mean_terminated_length": 6.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 184.57407407407408, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.697304725646973, + "kl": 0.0037406296469271183, + "learning_rate": 1.1333333333333334e-08, + "loss": 0.3122, + "num_tokens": 2931878.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.59259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062496013939380646, + "kl": 0.003379064262844622, + "learning_rate": 1.1000000000000001e-08, + "loss": 0.0002, + "num_tokens": 2932145.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 184.61111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.009119510650635, + "kl": 0.11926095932722092, + "learning_rate": 1.0666666666666668e-08, + "loss": 0.1536, + "num_tokens": 2932465.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 184.62962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08952630311250687, + "kl": 0.006811510305851698, + "learning_rate": 1.0333333333333333e-08, + "loss": 0.0004, + "num_tokens": 2932695.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.64814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.000314667442580685, + "kl": 0.001240525976754725, + "learning_rate": 1e-08, + "loss": 0.0001, + "num_tokens": 2932975.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 184.66666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0069992542266846, + "kl": 0.07766726985573769, + "learning_rate": 9.666666666666667e-09, + "loss": 0.0135, + "num_tokens": 2933341.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 184.6851851851852, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6369693279266357, + "kl": 0.03867245092988014, + "learning_rate": 9.333333333333333e-09, + "loss": 0.1959, + "num_tokens": 2933627.0, + "reward": 6.875, + "reward_std": 2.25, + "rewards/reward_combined/mean": 6.875, + "rewards/reward_combined/std": 2.25, + "step": 9973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 184.7037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001838079420849681, + "kl": 0.0005452297627925873, + "learning_rate": 9.000000000000001e-09, + "loss": 0.0, + "num_tokens": 2933887.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 24.75, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 184.72222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.707056522369385, + "kl": 0.07085501775145531, + "learning_rate": 8.666666666666667e-09, + "loss": 0.0259, + "num_tokens": 2934206.0, + "reward": 3.25, + "reward_std": 1.5, + "rewards/reward_combined/mean": 3.25, + "rewards/reward_combined/std": 1.5, + "step": 9975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.74074074074073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6058717966079712, + "kl": 0.06466163269942626, + "learning_rate": 8.333333333333334e-09, + "loss": 0.0041, + "num_tokens": 2934486.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 184.75925925925927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018751665484160185, + "kl": 0.003562338650226593, + "learning_rate": 8e-09, + "loss": 0.0002, + "num_tokens": 2934722.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 4.0, + "rewards/reward_combined/std": 0.0, + "step": 9977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.77777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11398318409919739, + "kl": 0.042106447741389275, + "learning_rate": 7.666666666666667e-09, + "loss": 0.0022, + "num_tokens": 2935017.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 184.7962962962963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016550902277231216, + "kl": 0.0007434528088197112, + "learning_rate": 7.333333333333334e-09, + "loss": 0.0, + "num_tokens": 2935339.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 184.8148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5287065505981445, + "kl": 0.07356359669938684, + "learning_rate": 7.000000000000001e-09, + "loss": 0.1737, + "num_tokens": 2935657.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 9980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 24.0, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 184.83333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02500924840569496, + "kl": 0.006159370765089989, + "learning_rate": 6.666666666666667e-09, + "loss": 0.0003, + "num_tokens": 2936005.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 36.5, + "completions/mean_terminated_length": 36.5, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 184.85185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5838184356689453, + "kl": 0.13607493788003922, + "learning_rate": 6.333333333333333e-09, + "loss": 0.0149, + "num_tokens": 2936379.0, + "reward": 6.75, + "reward_std": 2.5, + "rewards/reward_combined/mean": 6.75, + "rewards/reward_combined/std": 2.5, + "step": 9982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 184.87037037037038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08272148668766022, + "kl": 0.007379991700872779, + "learning_rate": 6e-09, + "loss": 0.0004, + "num_tokens": 2936647.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 184.88888888888889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15359057486057281, + "kl": 0.018743189051747322, + "learning_rate": 5.666666666666667e-09, + "loss": 0.0011, + "num_tokens": 2936935.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 184.90740740740742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02734927274286747, + "kl": 0.05121681094169617, + "learning_rate": 5.333333333333334e-09, + "loss": 0.0025, + "num_tokens": 2937271.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 184.92592592592592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17441505193710327, + "kl": 0.034458561800420284, + "learning_rate": 5e-09, + "loss": 0.0017, + "num_tokens": 2937567.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 12.0, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 184.94444444444446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02018146589398384, + "kl": 0.00628455744299572, + "learning_rate": 4.666666666666666e-09, + "loss": 0.0003, + "num_tokens": 2937839.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 184.96296296296296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051395975053310394, + "kl": 0.002733725297730416, + "learning_rate": 4.333333333333333e-09, + "loss": 0.0002, + "num_tokens": 2938093.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 184.9814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03161874786019325, + "kl": 0.0021529156947508454, + "learning_rate": 4e-09, + "loss": 0.0001, + "num_tokens": 2938389.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.75, + "completions/mean_terminated_length": 39.75, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 185.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5798838138580322, + "kl": 0.04691888391971588, + "learning_rate": 3.666666666666667e-09, + "loss": 0.0062, + "num_tokens": 2938772.0, + "reward": 5.75, + "reward_std": 3.5, + "rewards/reward_combined/mean": 5.75, + "rewards/reward_combined/std": 3.5, + "step": 9990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 185.0185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.500663757324219, + "kl": 0.16840357484761626, + "learning_rate": 3.3333333333333334e-09, + "loss": 0.0838, + "num_tokens": 2939035.0, + "reward": 2.75, + "reward_std": 1.5, + "rewards/reward_combined/mean": 2.75, + "rewards/reward_combined/std": 1.5, + "step": 9991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 2.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 2.0, + "completions/mean_terminated_length": 2.0, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 185.03703703703704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018531130626797676, + "kl": 0.0008435696363449097, + "learning_rate": 3e-09, + "loss": 0.0, + "num_tokens": 2939247.0, + "reward": 3.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.5, + "rewards/reward_combined/std": 0.0, + "step": 9992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 4.0, + "completions/mean_terminated_length": 4.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 185.05555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4365555620752275e-05, + "kl": 2.086162567138672e-06, + "learning_rate": 2.666666666666667e-09, + "loss": 0.0, + "num_tokens": 2939467.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 3.0, + "rewards/reward_combined/std": 0.0, + "step": 9993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 185.07407407407408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07230444997549057, + "kl": 0.011429775040596724, + "learning_rate": 2.333333333333333e-09, + "loss": 0.0006, + "num_tokens": 2939762.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 185.09259259259258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12053240835666656, + "kl": 0.014214991824701428, + "learning_rate": 2e-09, + "loss": 0.0007, + "num_tokens": 2940055.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 21.75, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 185.11111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8724961280822754, + "kl": 0.19114062655717134, + "learning_rate": 1.6666666666666667e-09, + "loss": 0.0543, + "num_tokens": 2940362.0, + "reward": 5.875, + "reward_std": 3.25, + "rewards/reward_combined/mean": 5.875, + "rewards/reward_combined/std": 3.25, + "step": 9996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 185.12962962962962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1254260390996933, + "kl": 0.009432489052414894, + "learning_rate": 1.3333333333333335e-09, + "loss": 0.0005, + "num_tokens": 2940633.0, + "reward": 7.5, + "reward_std": 0.0, + "rewards/reward_combined/mean": 7.5, + "rewards/reward_combined/std": 0.0, + "step": 9997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 185.14814814814815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3686263859272003, + "kl": 0.029108582995831966, + "learning_rate": 1e-09, + "loss": 0.0015, + "num_tokens": 2940917.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 9998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 185.16666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9218740463256836, + "kl": 0.005491628777235746, + "learning_rate": 6.666666666666667e-10, + "loss": 0.0235, + "num_tokens": 2941192.0, + "reward": 6.25, + "reward_std": 3.5, + "rewards/reward_combined/mean": 6.25, + "rewards/reward_combined/std": 3.5, + "step": 9999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 185.1851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007952062413096428, + "kl": 0.0004007460083812475, + "learning_rate": 3.3333333333333337e-10, + "loss": 0.0, + "num_tokens": 2941503.0, + "reward": 8.0, + "reward_std": 0.0, + "rewards/reward_combined/mean": 8.0, + "rewards/reward_combined/std": 0.0, + "step": 10000 + } + ], + "logging_steps": 1, + "max_steps": 10000, + "num_input_tokens_seen": 2941503, + "num_train_epochs": 186, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}