Add Hydra source training config
Browse files- training_config_source.yaml +98 -0
training_config_source.yaml
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
# REFERENCE COPY — canonical is in this workstream's local lerobot clone at:
|
| 3 |
+
# ./lerobot/lerobot/configs/policy/act_diffusion_aloha_solo_real.yaml
|
| 4 |
+
|
| 5 |
+
# Single-arm (LEFT) ALOHA — Hybrid ACT+Diffusion policy.
|
| 6 |
+
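# ACT encoder (ResNet18 + transformer) → DDIM diffusion U-Net → action chunks. NOTE(review): noise_scheduler_type below is set to DDPM, not DDIM — confirm which scheduler is intended.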
|
| 7 |
+
# 2 cameras: cam_left_wrist + cam_high. state_dim=action_dim=9.
|
| 8 |
+
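# DOE winner: batch=24, lr=3e-5 (2026-04-20). NOTE(review): the values set below are batch_size: 28 and lr / lr_backbone: 3.5e-5, which do not match — confirm which is current.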
|
| 9 |
+
|
| 10 |
+
seed: 1000
|
| 11 |
+
dataset_repo_id: JHeisler/aloha_solo_left_4_6_26
|
| 12 |
+
|
| 13 |
+
override_dataset_stats:
|
| 14 |
+
observation.images.cam_left_wrist:
|
| 15 |
+
mean: [[[0.485]], [[0.456]], [[0.406]]]
|
| 16 |
+
std: [[[0.229]], [[0.224]], [[0.225]]]
|
| 17 |
+
observation.images.cam_high:
|
| 18 |
+
mean: [[[0.485]], [[0.456]], [[0.406]]]
|
| 19 |
+
std: [[[0.229]], [[0.224]], [[0.225]]]
|
| 20 |
+
|
| 21 |
+
use_amp: true
|
| 22 |
+
use_torch_compile: true
|
| 23 |
+
|
| 24 |
+
training:
|
| 25 |
+
offline_steps: 40000
|
| 26 |
+
online_steps: 0
|
| 27 |
+
eval_freq: -1
|
| 28 |
+
save_freq: 10000
|
| 29 |
+
log_freq: 100
|
| 30 |
+
save_checkpoint: true
|
| 31 |
+
|
| 32 |
+
batch_size: 28
|
| 33 |
+
lr: 3.5e-5
|
| 34 |
+
lr_backbone: 3.5e-5
|
| 35 |
+
lr_warmup_steps: 500
|
| 36 |
+
drop_n_last_frames: 2
|
| 37 |
+
weight_decay: 1e-4
|
| 38 |
+
grad_clip_norm: 10
|
| 39 |
+
online_steps_between_rollouts: 1
|
| 40 |
+
|
| 41 |
+
delta_timestamps:
|
| 42 |
+
action: "[i / ${fps} for i in range(${policy.chunk_size})]"
|
| 43 |
+
|
| 44 |
+
eval:
|
| 45 |
+
n_episodes: 50
|
| 46 |
+
batch_size: 50
|
| 47 |
+
|
| 48 |
+
policy:
|
| 49 |
+
name: hybrid_act_diffusion
|
| 50 |
+
|
| 51 |
+
n_obs_steps: 1
|
| 52 |
+
chunk_size: 100
|
| 53 |
+
n_action_steps: 100
|
| 54 |
+
|
| 55 |
+
input_shapes:
|
| 56 |
+
observation.images.cam_left_wrist: [3, 480, 640]
|
| 57 |
+
observation.images.cam_high: [3, 480, 640]
|
| 58 |
+
observation.state: ["${env.state_dim}"]
|
| 59 |
+
output_shapes:
|
| 60 |
+
action: ["${env.action_dim}"]
|
| 61 |
+
|
| 62 |
+
input_normalization_modes:
|
| 63 |
+
observation.images.cam_left_wrist: mean_std
|
| 64 |
+
observation.images.cam_high: mean_std
|
| 65 |
+
observation.state: mean_std
|
| 66 |
+
output_normalization_modes:
|
| 67 |
+
action: mean_std
|
| 68 |
+
|
| 69 |
+
# ACT visual encoder
|
| 70 |
+
vision_backbone: resnet18
|
| 71 |
+
pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
|
| 72 |
+
replace_final_stride_with_dilation: false
|
| 73 |
+
pre_norm: false
|
| 74 |
+
dim_model: 512
|
| 75 |
+
n_heads: 8
|
| 76 |
+
dim_feedforward: 3200
|
| 77 |
+
feedforward_activation: relu
|
| 78 |
+
n_encoder_layers: 4
|
| 79 |
+
dropout: 0.1
|
| 80 |
+
|
| 81 |
+
# Diffusion U-Net
|
| 82 |
+
down_dims: [256, 512]
|
| 83 |
+
kernel_size: 5
|
| 84 |
+
n_groups: 8
|
| 85 |
+
diffusion_step_embed_dim: 128
|
| 86 |
+
use_film_scale_modulation: true
|
| 87 |
+
|
| 88 |
+
# Noise scheduler
|
| 89 |
+
noise_scheduler_type: DDPM
|
| 90 |
+
num_train_timesteps: 100
|
| 91 |
+
beta_schedule: squaredcos_cap_v2
|
| 92 |
+
beta_start: 0.0001
|
| 93 |
+
beta_end: 0.02
|
| 94 |
+
prediction_type: epsilon
|
| 95 |
+
clip_sample: true
|
| 96 |
+
clip_sample_range: 1.0
|
| 97 |
+
num_inference_steps: 10
|
| 98 |
+
do_mask_loss_for_padding: true
|