@@ -110,6 +110,20 @@ class KlAdaptiveSchedulerPerEpoch(KlAdaptiveScheduler):
     def invoke_after_each_epoch(self):
         return True

+class LinearDecayScheduler(LearningRateScheduler):
+    def __init__(self, cfg):
+        num_updates = cfg.train_for_env_steps // cfg.batch_size * cfg.num_epochs
+        self.linear_decay = LinearDecay([(0, cfg.learning_rate), (num_updates, 0)])
+        self.step = 0
+
+    def invoke_after_each_minibatch(self):
+        return True
+
+    def update(self, current_lr, recent_kls):
+        self.step += 1
+        lr = self.linear_decay.at(self.step)
+        return lr
+

 def get_lr_scheduler(cfg) -> LearningRateScheduler:
     if cfg.lr_schedule == "constant":
@@ -118,6 +132,8 @@ def get_lr_scheduler(cfg) -> LearningRateScheduler:
         return KlAdaptiveSchedulerPerMinibatch(cfg)
     elif cfg.lr_schedule == "kl_adaptive_epoch":
         return KlAdaptiveSchedulerPerEpoch(cfg)
+    elif cfg.lr_schedule == "linear_decay":
+        return LinearDecayScheduler(cfg)
     else:
         raise RuntimeError(f"Unknown scheduler {cfg.lr_schedule}")
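
The new LinearDecayScheduler anneals the learning rate linearly from cfg.learning_rate down to zero over the run, sizing the horizon as train_for_env_steps // batch_size * num_epochs, i.e. one update per minibatch with num_epochs passes over each batch. The LinearDecay helper comes from the library's existing utilities and is not shown in this patch; the sketch below is a hypothetical, self-contained equivalent (the class name and internals are assumptions for illustration, not the library's actual code):

    # Hypothetical stand-in for the LinearDecay utility used above:
    # piecewise-linear interpolation over (step, value) knots.
    class LinearDecaySketch:
        def __init__(self, knots):
            # knots: list of (step, value) pairs, e.g. [(0, 3e-3), (19530, 0.0)]
            self.knots = sorted(knots)

        def at(self, step):
            # clamp outside the knot range
            if step <= self.knots[0][0]:
                return self.knots[0][1]
            if step >= self.knots[-1][0]:
                return self.knots[-1][1]
            # interpolate between the two surrounding knots
            for (x0, y0), (x1, y1) in zip(self.knots, self.knots[1:]):
                if x0 <= step <= x1:
                    return y0 + (step - x0) / (x1 - x0) * (y1 - y0)

    schedule = LinearDecaySketch([(0, 3e-3), (100, 0.0)])
    assert schedule.at(50) == 1.5e-3  # halfway through the run, half the initial LR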
|
|
|
|
|
|
|
|
|
|
|
@@ -1,117 +1,155 @@
+# def mujoco_override_defaults(env, parser):
+#     parser.set_defaults(
+#         batched_sampling=False,
+#         num_workers=8,
+#         num_envs_per_worker=16,
+#         worker_num_splits=2,
+#         train_for_env_steps=1000000,
+#         encoder_type="mlp",
+#         encoder_subtype="mlp_mujoco",
+#         hidden_size=64,
+#         encoder_extra_fc_layers=0,
+#         env_frameskip=1,
+#         nonlinearity="tanh",
+#         batch_size=64,
+#         kl_loss_coeff=0.1,
+#         use_rnn=False,
+#         adaptive_stddev=False,
+#         policy_initialization="torch_default",
+#         reward_scale=0.01,
+#         rollout=8,
+#         max_grad_norm=0.0,
+#         ppo_epochs=10,
+#         num_batches_per_epoch=32,
+#         ppo_clip_ratio=0.2,
+#         value_loss_coeff=2.0,
+#         exploration_loss_coeff=0.0,
+#         learning_rate=3e-3,
+#         lr_schedule="constant",
+#         shuffle_minibatches=True,
+#         gamma=0.99,
+#         gae_lambda=0.95,
+#         with_vtrace=False,
+#         recurrence=1,
+#         value_bootstrap=False,
+#         normalize_input=True,
+#         experiment_summaries_interval=3,
+#         save_every_sec=15,
+#         serial_mode=False,
+#         async_rl=False,
+#     )
+
+#     # environment specific overrides
+#     env_name = "_".join(env.split("_")[1:]).lower()
+
+#     if env_name == "halfcheetah":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             normalize_input=False,
+#             num_batches_per_epoch=1,
+#         )
+#     if env_name == "humanoid":
+#         parser.set_defaults(
+#             learning_rate=3e-4,
+#         )
+#     if env_name == "hopper":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             # normalize_input=False,
+#             # num_batches_per_epoch=1,
+#             # normalize_returns=True,
+#             # hidden_size=128,
+#         )
+#     if env_name == "doublependulum":
+#         parser.set_defaults(
+#             reward_scale=0.01,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#         )
+#     if env_name == "pendulum":
+#         parser.set_defaults(
+#             # reward_scale=0.01,
+#             learning_rate=3e-4,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-3,
+#         )
+#     if env_name == "reacher":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#             normalize_input=False,
+#             num_batches_per_epoch=1,
+#         )
+#     if env_name == "swimmer":
+#         parser.set_defaults(
+#             reward_scale=1,
+#             # learning_rate=3e-3,
+#             # lr_schedule="kl_adaptive_epoch",
+#             # lr_schedule_kl_threshold=3e-2,
+#             # gamma=0.9995,
+#             rollout=128,
+#             batch_size=128,
+#         )
+#     if env_name == "walker":
+#         parser.set_defaults(
+#             reward_scale=0.1,
+#             learning_rate=3e-3,
+#             lr_schedule="kl_adaptive_epoch",
+#             lr_schedule_kl_threshold=3e-2,
+#         )
+
 def mujoco_override_defaults(env, parser):
     parser.set_defaults(
         batched_sampling=False,
         num_workers=8,
-        num_envs_per_worker=16,
+        num_envs_per_worker=8,
         worker_num_splits=2,
-        train_for_env_steps=1000000,
+        train_for_env_steps=10000000,
         encoder_type="mlp",
         encoder_subtype="mlp_mujoco",
         hidden_size=64,
         encoder_extra_fc_layers=0,
         env_frameskip=1,
         nonlinearity="tanh",
-        batch_size=64,
+        batch_size=1024,
         kl_loss_coeff=0.1,
-
         use_rnn=False,
         adaptive_stddev=False,
         policy_initialization="torch_default",
-        reward_scale=0.01,
-        rollout=8,
-        max_grad_norm=0.0,
-        ppo_epochs=10,
-        num_batches_per_epoch=32,
+        reward_scale=1,
+        rollout=64,
+        max_grad_norm=3.5,
+        num_epochs=2,
+        num_batches_per_epoch=4,
         ppo_clip_ratio=0.2,
-        value_loss_coeff=2.0,
+        value_loss_coeff=1.3,
         exploration_loss_coeff=0.0,
-        learning_rate=3e-3,
-        lr_schedule="constant",
-        shuffle_minibatches=True,
+        learning_rate=0.00295,
+        lr_schedule="linear_decay",
+        shuffle_minibatches=False,
         gamma=0.99,
         gae_lambda=0.95,
         with_vtrace=False,
         recurrence=1,
         value_bootstrap=False,
         normalize_input=True,
+        normalize_returns=True,
         experiment_summaries_interval=3,
         save_every_sec=15,
-
         serial_mode=False,
         async_rl=False,
     )

-    # environment specific overrides
-    env_name = "_".join(env.split("_")[1:]).lower()
-
-    if env_name == "halfcheetah":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            normalize_input=False,
-            num_batches_per_epoch=1,
-        )
-    if env_name == "humanoid":
-        parser.set_defaults(
-            learning_rate=3e-4,
-        )
-    if env_name == "hopper":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-            # normalize_returns=True,
-            # hidden_size=128,
-        )
-    if env_name == "doublependulum":
-        parser.set_defaults(
-            reward_scale=0.01,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-        )
-    if env_name == "pendulum":
-        parser.set_defaults(
-            # reward_scale=0.01,
-            learning_rate=3e-4,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-3,
-        )
-    if env_name == "reacher":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            normalize_input=False,
-            num_batches_per_epoch=1,
-        )
-    if env_name == "swimmer":
-        parser.set_defaults(
-            reward_scale=1,
-            learning_rate=3e-4,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-3,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-            normalize_returns=True,
-            hidden_size=128,
-        )
-    if env_name == "walker":
-        parser.set_defaults(
-            reward_scale=0.1,
-            learning_rate=3e-3,
-            lr_schedule="kl_adaptive_epoch",
-            lr_schedule_kl_threshold=3e-2,
-            # normalize_returns=True,
-            # normalize_input=False,
-            # num_batches_per_epoch=1,
-        )
+


 # noinspection PyUnusedLocal
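
Plugged into LinearDecayScheduler above, these defaults give a decay horizon of 10000000 // 1024 * 2 = 19530 minibatch updates, so the 0.00295 learning rate reaches zero right at the end of training. A quick sanity check (variable names mirror the config fields; nothing assumed beyond the formula in __init__ above):

    train_for_env_steps = 10_000_000
    batch_size = 1024
    num_epochs = 2

    # same expression as LinearDecayScheduler.__init__:
    # one update per minibatch, num_epochs passes over each batch
    num_updates = train_for_env_steps // batch_size * num_epochs
    print(num_updates)  # 19530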
|
|
|
|
|
|
|
|
|
@@ -276,7 +276,7 @@ class MlpEncoder(EncoderBase):
         self.init_fc_blocks(fc_encoder_layer)

     def forward(self, obs_dict):
-        x = self.mlp_head(obs_dict['obs'].float())
+        x = self.mlp_head(obs_dict["obs"].float())
         x = self.forward_fc_blocks(x)
         return x
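
The encoder reads the "obs" tensor out of the observation dict, casts it to float, and pushes it through the MLP head and the fc blocks. A minimal sketch of that data flow (the layer sizes and the 17-dimensional observation are illustrative assumptions consistent with hidden_size=64 and nonlinearity="tanh" from the defaults above, not the library's exact module):

    import torch
    from torch import nn

    # stand-in for self.mlp_head under the mlp_mujoco defaults
    mlp_head = nn.Sequential(nn.Linear(17, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh())

    obs_dict = {"obs": torch.zeros(8, 17)}  # batch of 8 observations
    x = mlp_head(obs_dict["obs"].float())
    print(x.shape)  # torch.Size([8, 64])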
|
|
|
|
|
|
|
|
|
|
|
@@ -8,12 +8,12 @@ _params = ParamGrid(
         [
             "mujoco_ant",
             "mujoco_halfcheetah",
-            "mujoco_hopper",
+            # "mujoco_hopper",
             "mujoco_humanoid",
-            "mujoco_doublependulum",
-            "mujoco_pendulum",
-            "mujoco_reacher",
-            "mujoco_swimmer",
+            # "mujoco_doublependulum",
+            # "mujoco_pendulum",
+            # "mujoco_reacher",
+            # "mujoco_swimmer",
             "mujoco_walker",
         ],
     ),
@@ -23,11 +23,11 @@ _params = ParamGrid(
 _experiments = [
     Experiment(
         "mujoco_all_envs",
-        "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_4",
+        "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_crl_4",
         _params.generate_params(randomize=False),
     ),
 ]


 RUN_DESCRIPTION = RunDescription("mujoco_all_envs", experiments=_experiments)
-# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=8 --pause_between=1 --experiments_per_gpu=10000 --num_gpus=1 --experiment_suffix=4
+# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=4 --pause_between=1 --experiments_per_gpu=32 --num_gpus=1 --experiment_suffix=crl_3
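
For context, ParamGrid.generate_params(randomize=False) yields one parameter dict per grid point, in order, so the four environments left active launch as four experiments. A rough stand-in for that expansion (an illustration of the cartesian product it performs, not the runner's actual implementation):

    from itertools import product

    def generate_params(axes):
        # axes: list of (param_name, list_of_values) pairs
        names = [name for name, _ in axes]
        for combo in product(*(values for _, values in axes)):
            yield dict(zip(names, combo))

    envs = ["mujoco_ant", "mujoco_halfcheetah", "mujoco_humanoid", "mujoco_walker"]
    for params in generate_params([("env", envs)]):
        print(params)  # {'env': 'mujoco_ant'}, {'env': 'mujoco_halfcheetah'}, ...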
|
|