diff --git a/sample_factory/algo/learning/learner.py b/sample_factory/algo/learning/learner.py
index 178d2ab..20bb937 100644
--- a/sample_factory/algo/learning/learner.py
+++ b/sample_factory/algo/learning/learner.py
@@ -110,6 +110,20 @@ class KlAdaptiveSchedulerPerEpoch(KlAdaptiveScheduler):
     def invoke_after_each_epoch(self):
         return True
 
+class LinearDecayScheduler(LearningRateScheduler):
+    def __init__(self, cfg):
+        num_updates = cfg.train_for_env_steps // cfg.batch_size * cfg.num_epochs
+        self.linear_decay = LinearDecay([(0, cfg.learning_rate), (num_updates, 0)])
+        self.step = 0
+
+    def invoke_after_each_minibatch(self):
+        return True
+
+    def update(self, current_lr, recent_kls):
+        self.step += 1
+        lr = self.linear_decay.at(self.step)
+        return lr
+
 
 def get_lr_scheduler(cfg) -> LearningRateScheduler:
     if cfg.lr_schedule == "constant":
@@ -118,6 +132,8 @@ def get_lr_scheduler(cfg) -> LearningRateScheduler:
         return KlAdaptiveSchedulerPerMinibatch(cfg)
     elif cfg.lr_schedule == "kl_adaptive_epoch":
         return KlAdaptiveSchedulerPerEpoch(cfg)
+    elif cfg.lr_schedule == "linear_decay":
+        return LinearDecayScheduler(cfg)
     else:
         raise RuntimeError(f"Unknown scheduler {cfg.lr_schedule}")
 
diff --git a/sample_factory/envs/mujoco/mujoco_params.py b/sample_factory/envs/mujoco/mujoco_params.py
index ef0b486..cb4b977 100644
--- a/sample_factory/envs/mujoco/mujoco_params.py
+++ b/sample_factory/envs/mujoco/mujoco_params.py
@@ -1,117 +1,155 @@
+# def mujoco_override_defaults(env, parser):
+#     parser.set_defaults(
+#         batched_sampling=False,
+#         num_workers=8,
+#         num_envs_per_worker=16,
+#         worker_num_splits=2,
+#         train_for_env_steps=1000000,
+#         encoder_type="mlp",
+#         encoder_subtype="mlp_mujoco",
+#         hidden_size=64,
+#         encoder_extra_fc_layers=0,
+#         env_frameskip=1,
+#         nonlinearity="tanh",
+#         batch_size=64,
+#         kl_loss_coeff=0.1,
+#         use_rnn=False,
+#         adaptive_stddev=False,
+#         policy_initialization="torch_default",
+#         reward_scale=0.01,
+#         rollout=8,
+#         max_grad_norm=0.0,
+#         ppo_epochs=10,
+#         num_batches_per_epoch=32,
+#         ppo_clip_ratio=0.2,
+#         value_loss_coeff=2.0,
+#         exploration_loss_coeff=0.0,
+#         learning_rate=3e-3,
+#         lr_schedule="constant",
+#         shuffle_minibatches=True,
+#         gamma=0.99,
+#         gae_lambda=0.95,
+#         with_vtrace=False,
+#         recurrence=1,
+#         value_bootstrap=False,
+#         normalize_input=True,
+#         experiment_summaries_interval=3,
+#         save_every_sec=15,
+#         serial_mode=False,
+#         async_rl=False,
+#     )
+
+#     # environment specific overrides
+#     env_name = "_".join(env.split("_")[1:]).lower()
+
+#     if env_name == "halfcheetah":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             normalize_input=False,
+#             num_batches_per_epoch=1,
+#         )
+#     if env_name == "humanoid":
+#         parser.set_defaults(
+#             learning_rate=3e-4,
+#         )
+#     if env_name == "hopper":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             # normalize_input=False,
+#             # num_batches_per_epoch=1,
+#             # normalize_returns=True,
+#             # hidden_size=128,
+#         )
+#     if env_name == "doublependulum":
+#         parser.set_defaults(
+#             reward_scale=0.01,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#         )
+#     if env_name == "pendulum":
+#         parser.set_defaults(
+#             # reward_scale=0.01,
+#             learning_rate=3e-4,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-3,
+#         )
+#     if env_name == "reacher":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             normalize_input=False,
+#             num_batches_per_epoch=1,
+#         )
+#     if env_name == "swimmer":
+#         parser.set_defaults(
+#             reward_scale=1,
+#             # learning_rate=3e-3,
+#             # lr_schedule="kl_adaptive_epoch",
+#             # lr_schedule_kl_threshold=3e-2,
+#             # gamma=0.9995,
+#             rollout=128,
+#             batch_size=128,
+#         )
+#     if env_name == "walker":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#         )
+
 def mujoco_override_defaults(env, parser):
     parser.set_defaults(
         batched_sampling=False,
         num_workers=8,
-        num_envs_per_worker=16,
+        num_envs_per_worker=8,
         worker_num_splits=2,
-        train_for_env_steps=1000000,
+        train_for_env_steps=10000000,
         encoder_type="mlp",
         encoder_subtype="mlp_mujoco",
         hidden_size=64,
         encoder_extra_fc_layers=0,
         env_frameskip=1,
         nonlinearity="tanh",
-        batch_size=64,
+        batch_size=1024,
         kl_loss_coeff=0.1,
-        use_rnn=False,
         adaptive_stddev=False,
         policy_initialization="torch_default",
-        reward_scale=0.01,
-        rollout=8,
-        max_grad_norm=0.0,
-        ppo_epochs=10,
-        num_batches_per_epoch=32,
+        reward_scale=1,
+        rollout=64,
+        max_grad_norm=3.5,
+        num_epochs=2,
+        num_batches_per_epoch=4,
         ppo_clip_ratio=0.2,
-        value_loss_coeff=2.0,
+        value_loss_coeff=1.3,
         exploration_loss_coeff=0.0,
-        learning_rate=3e-3,
-        lr_schedule="constant",
-        shuffle_minibatches=True,
+        learning_rate=0.00295,
+        lr_schedule="linear_decay",
+        shuffle_minibatches=False,
         gamma=0.99,
         gae_lambda=0.95,
         with_vtrace=False,
         recurrence=1,
         value_bootstrap=False,
         normalize_input=True,
+        normalize_returns=True,
         experiment_summaries_interval=3,
         save_every_sec=15,
-        serial_mode=False,
         async_rl=False,
     )
 
-    # environment specific overrides
-    env_name = "_".join(env.split("_")[1:]).lower()
-
-    if env_name == "halfcheetah":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            normalize_input=False,
-            num_batches_per_epoch=1,
-        )
-    if env_name == "humanoid":
-        parser.set_defaults(
-            learning_rate=3e-4,
-        )
-    if env_name == "hopper":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-            # normalize_returns=True,
-            # hidden_size=128,
-        )
-    if env_name == "doublependulum":
-        parser.set_defaults(
-            reward_scale=0.01,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-        )
-    if env_name == "pendulum":
-        parser.set_defaults(
-            # reward_scale=0.01,
-            learning_rate=3e-4,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-3,
-        )
-    if env_name == "reacher":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            normalize_input=False,
-            num_batches_per_epoch=1,
-        )
-    if env_name == "swimmer":
-        parser.set_defaults(
-            reward_scale=1,
-            learning_rate=3e-4,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-3,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-            normalize_returns=True,
-            hidden_size=128,
-        )
-    if env_name == "walker":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            # normalize_returns=True,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-        )
+
 
 # noinspection PyUnusedLocal
diff --git a/sample_factory/model/model_utils.py b/sample_factory/model/model_utils.py
index df6c82c..d8226d8 100644
--- a/sample_factory/model/model_utils.py
+++ b/sample_factory/model/model_utils.py
@@ -276,7 +276,7 @@ class MlpEncoder(EncoderBase):
         self.init_fc_blocks(fc_encoder_layer)
 
     def forward(self, obs_dict):
-        x = self.mlp_head(obs_dict['obs'].float())
+        x = self.mlp_head(obs_dict["obs"].float())
         x = self.forward_fc_blocks(x)
         return x
 
diff --git a/sample_factory/runner/runs/mujoco_all_envs.py b/sample_factory/runner/runs/mujoco_all_envs.py
index 3ac67ce..5cbaa1a 100644
--- a/sample_factory/runner/runs/mujoco_all_envs.py
+++ b/sample_factory/runner/runs/mujoco_all_envs.py
@@ -8,12 +8,12 @@ _params = ParamGrid(
             [
                 "mujoco_ant",
                 "mujoco_halfcheetah",
-                "mujoco_hopper",
+                # "mujoco_hopper",
                 "mujoco_humanoid",
-                "mujoco_doublependulum",
-                "mujoco_pendulum",
-                "mujoco_reacher",
-                "mujoco_swimmer",
+                # "mujoco_doublependulum",
+                # "mujoco_pendulum",
+                # "mujoco_reacher",
+                # "mujoco_swimmer",
                 "mujoco_walker",
             ],
         ),
@@ -23,11 +23,11 @@ _params = ParamGrid(
 _experiments = [
     Experiment(
         "mujoco_all_envs",
-        "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_4",
+        "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_crl_4",
         _params.generate_params(randomize=False),
     ),
 ]
 
 RUN_DESCRIPTION = RunDescription("mujoco_all_envs", experiments=_experiments)
 
-# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=8 --pause_between=1 --experiments_per_gpu=10000 --num_gpus=1 --experiment_suffix=4
+# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=4 --pause_between=1 --experiments_per_gpu=32 --num_gpus=1 --experiment_suffix=crl_3
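
Note: the sketch below is not part of the patch. It illustrates how long the new "linear_decay" schedule runs with the Mujoco defaults introduced above, using the same formula as LinearDecayScheduler.__init__ in the learner.py hunk. The linear interpolation from (0, learning_rate) to (num_updates, 0) is assumed to match the behavior of the LinearDecay helper used there.

# Hypothetical sanity check, not part of the patch.
train_for_env_steps = 10_000_000
batch_size = 1024
num_epochs = 2

# Same arithmetic as LinearDecayScheduler.__init__:
num_updates = train_for_env_steps // batch_size * num_epochs  # 19530 minibatch updates

learning_rate = 0.00295

def lr_at(step: int) -> float:
    # Assumed LinearDecay behavior: linear interpolation from
    # (0, learning_rate) down to (num_updates, 0), clamped at 0.
    return learning_rate * max(0.0, 1.0 - step / num_updates)

print(num_updates, lr_at(0), lr_at(num_updates // 2), lr_at(num_updates))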