Upload with huggingface_hub (commit 84a6a5e)
diff --git a/sample_factory/algo/learning/learner.py b/sample_factory/algo/learning/learner.py
index 178d2ab..20bb937 100644
--- a/sample_factory/algo/learning/learner.py
+++ b/sample_factory/algo/learning/learner.py
@@ -110,6 +110,20 @@ class KlAdaptiveSchedulerPerEpoch(KlAdaptiveScheduler):
     def invoke_after_each_epoch(self):
         return True
+class LinearDecayScheduler(LearningRateScheduler):
+    def __init__(self, cfg):
+        num_updates = cfg.train_for_env_steps // cfg.batch_size * cfg.num_epochs
+        self.linear_decay = LinearDecay([(0, cfg.learning_rate), (num_updates, 0)])
+        self.step = 0
+
+    def invoke_after_each_minibatch(self):
+        return True
+
+    def update(self, current_lr, recent_kls):
+        self.step += 1
+        lr = self.linear_decay.at(self.step)
+        return lr
+
 def get_lr_scheduler(cfg) -> LearningRateScheduler:
     if cfg.lr_schedule == "constant":
@@ -118,6 +132,8 @@ def get_lr_scheduler(cfg) -> LearningRateScheduler:
         return KlAdaptiveSchedulerPerMinibatch(cfg)
     elif cfg.lr_schedule == "kl_adaptive_epoch":
         return KlAdaptiveSchedulerPerEpoch(cfg)
+    elif cfg.lr_schedule == "linear_decay":
+        return LinearDecayScheduler(cfg)
     else:
         raise RuntimeError(f"Unknown scheduler {cfg.lr_schedule}")
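
The new linear_decay schedule drives the learning rate from cfg.learning_rate down to 0 over num_updates = train_for_env_steps // batch_size * num_epochs minibatch updates. A minimal standalone sketch of that behaviour (hand-rolled interpolation for illustration only, not sample-factory's LinearDecay helper; the numbers reuse the MuJoCo defaults introduced later in this diff):

def linear_decay_lr(step, total_updates, initial_lr):
    # Linear interpolation from initial_lr at step 0 down to 0 at total_updates.
    frac = min(max(step / total_updates, 0.0), 1.0)
    return initial_lr * (1.0 - frac)

# With train_for_env_steps=10000000, batch_size=1024, num_epochs=2:
# num_updates = 10000000 // 1024 * 2 = 19530
print(linear_decay_lr(0, 19530, 0.00295))      # 0.00295
print(linear_decay_lr(9765, 19530, 0.00295))   # ~0.001475 (halfway)
print(linear_decay_lr(19530, 19530, 0.00295))  # 0.0
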
diff --git a/sample_factory/envs/mujoco/mujoco_params.py b/sample_factory/envs/mujoco/mujoco_params.py
index ef0b486..cb4b977 100644
--- a/sample_factory/envs/mujoco/mujoco_params.py
+++ b/sample_factory/envs/mujoco/mujoco_params.py
@@ -1,117 +1,155 @@
+# def mujoco_override_defaults(env, parser):
+# parser.set_defaults(
+# batched_sampling=False,
+# num_workers=8,
+# num_envs_per_worker=16,
+# worker_num_splits=2,
+# train_for_env_steps=1000000,
+# encoder_type="mlp",
+# encoder_subtype="mlp_mujoco",
+# hidden_size=64,
+# encoder_extra_fc_layers=0,
+# env_frameskip=1,
+# nonlinearity="tanh",
+# batch_size=64,
+# kl_loss_coeff=0.1,
+# use_rnn=False,
+# adaptive_stddev=False,
+# policy_initialization="torch_default",
+# reward_scale=0.01,
+# rollout=8,
+# max_grad_norm=0.0,
+# ppo_epochs=10,
+# num_batches_per_epoch=32,
+# ppo_clip_ratio=0.2,
+# value_loss_coeff=2.0,
+# exploration_loss_coeff=0.0,
+# learning_rate=3e-3,
+# lr_schedule="constant",
+# shuffle_minibatches=True,
+# gamma=0.99,
+# gae_lambda=0.95,
+# with_vtrace=False,
+# recurrence=1,
+# value_bootstrap=False,
+# normalize_input=True,
+# experiment_summaries_interval=3,
+# save_every_sec=15,
+# serial_mode=False,
+# async_rl=False,
+# )
+
+# # environment specific overrides
+# env_name = "_".join(env.split("_")[1:]).lower()
+
+# if env_name == "halfcheetah":
+# parser.set_defaults(
+# reward_scale=0.1,
+# learning_rate=3e-3,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-2,
+# normalize_input=False,
+# num_batches_per_epoch=1,
+# )
+# if env_name == "humanoid":
+# parser.set_defaults(
+# learning_rate=3e-4,
+# )
+# if env_name == "hopper":
+# parser.set_defaults(
+# reward_scale=0.1,
+# learning_rate=3e-3,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-2,
+# # normalize_input=False,
+# # num_batches_per_epoch=1,
+# # normalize_returns=True,
+# # hidden_size=128,
+# )
+# if env_name == "doublependulum":
+# parser.set_defaults(
+# reward_scale=0.01,
+# learning_rate=3e-3,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-2,
+# )
+# if env_name == "pendulum":
+# parser.set_defaults(
+# # reward_scale=0.01,
+# learning_rate=3e-4,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-3,
+# )
+# if env_name == "reacher":
+# parser.set_defaults(
+# reward_scale=0.1,
+# learning_rate=3e-3,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-2,
+# normalize_input=False,
+# num_batches_per_epoch=1,
+# )
+# if env_name == "swimmer":
+# parser.set_defaults(
+# reward_scale=1,
+# # learning_rate=3e-3,
+# # lr_schedule="kl_adaptive_epoch",
+# # lr_schedule_kl_threshold=3e-2,
+# # gamma=0.9995,
+# rollout=128,
+# batch_size=128,
+# )
+# if env_name == "walker":
+# parser.set_defaults(
+# reward_scale=0.1,
+# learning_rate=3e-3,
+# lr_schedule="kl_adaptive_epoch",
+# lr_schedule_kl_threshold=3e-2,
+# )
+
def mujoco_override_defaults(env, parser):
parser.set_defaults(
batched_sampling=False,
num_workers=8,
- num_envs_per_worker=16,
+ num_envs_per_worker=8,
worker_num_splits=2,
- train_for_env_steps=1000000,
+ train_for_env_steps=10000000,
encoder_type="mlp",
encoder_subtype="mlp_mujoco",
hidden_size=64,
encoder_extra_fc_layers=0,
env_frameskip=1,
nonlinearity="tanh",
- batch_size=64,
+ batch_size=1024,
kl_loss_coeff=0.1,
-
use_rnn=False,
adaptive_stddev=False,
policy_initialization="torch_default",
- reward_scale=0.01,
- rollout=8,
- max_grad_norm=0.0,
- ppo_epochs=10,
- num_batches_per_epoch=32,
+ reward_scale=1,
+ rollout=64,
+ max_grad_norm=3.5,
+ num_epochs=2,
+ num_batches_per_epoch=4,
ppo_clip_ratio=0.2,
- value_loss_coeff=2.0,
+ value_loss_coeff=1.3,
exploration_loss_coeff=0.0,
- learning_rate=3e-3,
- lr_schedule="constant",
- shuffle_minibatches=True,
+ learning_rate=0.00295,
+ lr_schedule="linear_decay",
+ shuffle_minibatches=False,
gamma=0.99,
gae_lambda=0.95,
with_vtrace=False,
recurrence=1,
value_bootstrap=False,
normalize_input=True,
+ normalize_returns=True,
experiment_summaries_interval=3,
save_every_sec=15,
-
serial_mode=False,
async_rl=False,
)
- # environment specific overrides
- env_name = "_".join(env.split("_")[1:]).lower()
-
- if env_name == "halfcheetah":
- parser.set_defaults(
- reward_scale=0.1,
- learning_rate=3e-3,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-2,
- normalize_input=False,
- num_batches_per_epoch=1,
- )
- if env_name == "humanoid":
- parser.set_defaults(
- learning_rate=3e-4,
- )
- if env_name == "hopper":
- parser.set_defaults(
- reward_scale=0.1,
- learning_rate=3e-3,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-2,
- # normalize_input=False,
- # num_batches_per_epoch=1,
- # normalize_returns=True,
- # hidden_size=128,
- )
- if env_name == "doublependulum":
- parser.set_defaults(
- reward_scale=0.01,
- learning_rate=3e-3,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-2,
- )
- if env_name == "pendulum":
- parser.set_defaults(
- # reward_scale=0.01,
- learning_rate=3e-4,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-3,
- )
- if env_name == "reacher":
- parser.set_defaults(
- reward_scale=0.1,
- learning_rate=3e-3,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-2,
- normalize_input=False,
- num_batches_per_epoch=1,
- )
- if env_name == "swimmer":
- parser.set_defaults(
- reward_scale=1,
- learning_rate=3e-4,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-3,
- # normalize_input=False,
- # num_batches_per_epoch=1,
- normalize_returns=True,
- hidden_size=128,
- )
- if env_name == "walker":
- parser.set_defaults(
- reward_scale=0.1,
- learning_rate=3e-3,
- lr_schedule="kl_adaptive_epoch",
- lr_schedule_kl_threshold=3e-2,
- # normalize_returns=True,
- # normalize_input=False,
- # num_batches_per_epoch=1,
- )
+
# noinspection PyUnusedLocal
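
A quick sanity check on the new defaults, assuming the usual sample-factory bookkeeping in which one sampling iteration collects num_workers * num_envs_per_worker * rollout transitions and one training epoch consumes batch_size * num_batches_per_epoch of them (illustrative arithmetic only, not part of the diff):

# The new defaults keep collection and consumption per iteration balanced.
num_workers = 8
num_envs_per_worker = 8
rollout = 64
batch_size = 1024
num_batches_per_epoch = 4

collected_per_iteration = num_workers * num_envs_per_worker * rollout  # 8 * 8 * 64 = 4096
consumed_per_epoch = batch_size * num_batches_per_epoch                # 1024 * 4 = 4096
assert collected_per_iteration == consumed_per_epoch
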
diff --git a/sample_factory/model/model_utils.py b/sample_factory/model/model_utils.py
index df6c82c..d8226d8 100644
--- a/sample_factory/model/model_utils.py
+++ b/sample_factory/model/model_utils.py
@@ -276,7 +276,7 @@ class MlpEncoder(EncoderBase):
         self.init_fc_blocks(fc_encoder_layer)
     def forward(self, obs_dict):
-        x = self.mlp_head(obs_dict['obs'].float())
+        x = self.mlp_head(obs_dict["obs"].float())
         x = self.forward_fc_blocks(x)
         return x
diff --git a/sample_factory/runner/runs/mujoco_all_envs.py b/sample_factory/runner/runs/mujoco_all_envs.py
index 3ac67ce..5cbaa1a 100644
--- a/sample_factory/runner/runs/mujoco_all_envs.py
+++ b/sample_factory/runner/runs/mujoco_all_envs.py
@@ -8,12 +8,12 @@ _params = ParamGrid(
[
"mujoco_ant",
"mujoco_halfcheetah",
- "mujoco_hopper",
+ # "mujoco_hopper",
"mujoco_humanoid",
- "mujoco_doublependulum",
- "mujoco_pendulum",
- "mujoco_reacher",
- "mujoco_swimmer",
+ # "mujoco_doublependulum",
+ # "mujoco_pendulum",
+ # "mujoco_reacher",
+ # "mujoco_swimmer",
"mujoco_walker",
],
),
@@ -23,11 +23,11 @@ _params = ParamGrid(
_experiments = [
Experiment(
"mujoco_all_envs",
- "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_4",
+ "python -m sample_factory_examples.mujoco_examples.train_mujoco --algo=APPO --with_wandb=True --wandb_tags mujoco runner_crl_4",
_params.generate_params(randomize=False),
),
]
RUN_DESCRIPTION = RunDescription("mujoco_all_envs", experiments=_experiments)
-# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=8 --pause_between=1 --experiments_per_gpu=10000 --num_gpus=1 --experiment_suffix=4
+# python -m sample_factory.runner.run --run=mujoco_all_envs --runner=processes --max_parallel=4 --pause_between=1 --experiments_per_gpu=32 --num_gpus=1 --experiment_suffix=crl_3
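
For context, the runner expands the ParamGrid above into one experiment per remaining env and launches them with the command in the comment. A simplified sketch of that expansion (a stand-in for _params.generate_params(randomize=False), not the actual sample_factory.runner API):

from itertools import product

# Take the cross-product of all parameter value lists (only "env" is varied here).
grid = [("env", ["mujoco_ant", "mujoco_halfcheetah", "mujoco_humanoid", "mujoco_walker"])]
names = [name for name, _ in grid]
runs = [dict(zip(names, combo)) for combo in product(*(values for _, values in grid))]
# runs == [{'env': 'mujoco_ant'}, {'env': 'mujoco_halfcheetah'},
#          {'env': 'mujoco_humanoid'}, {'env': 'mujoco_walker'}]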