# Settings shared by the dataset pipeline and the model.
common:
  # Number of historical images
  img_history_size: 2
  # Number of future actions to predict
  action_chunk_size: 64
  # Number of cameras used by the model
  num_cameras: 3
  # Dimension of the state/action vector; state and action share the same space.
  # This MUST be equal to the one in configs/state_vec.py
  state_dim: 128
dataset:
  # Data flow: a producer extracts samples from the raw dataset and stores
  # them in a disk buffer. During training, a consumer reads samples randomly
  # from that buffer, and the producer replaces the samples that have already
  # been read by the consumer with new data.

  # Path to the disk buffer (at least 400GB)
  buf_path: /path/to/buffer
  # Number of chunks in the buffer
  buf_num_chunks: 512
  # Number of samples (steps rather than episodes) in each chunk
  buf_chunk_size: 512

  # Episodes shorter than `epsd_len_thresh_low` are filtered out
  epsd_len_thresh_low: 32
  # For episodes longer than `epsd_len_thresh_high`, we randomly sample
  # `epsd_len_thresh_high` steps each time the episode is loaded,
  # to better balance the training datasets
  epsd_len_thresh_high: 2048
  # How to fit the image size
  image_aspect_ratio: pad
  # Maximum number of language tokens
  tokenizer_max_length: 1024
# NOTE(review): the nesting of `rdt`, `noise_scheduler` and `ema` under
# `model` was reconstructed from a copy of this file that had lost its
# indentation -- confirm against the code that reads this config.
model:
  # Config for condition adaptors
  lang_adaptor: mlp2x_gelu
  img_adaptor: mlp2x_gelu
  state_adaptor: mlp3x_gelu
  lang_token_dim: 4096
  img_token_dim: 1152
  # Dim of action or proprioception vector
  # A `state` refers to an action or a proprioception vector
  state_token_dim: 128
  # Config for RDT structure
  rdt:
    # 1B: num_head 32 hidden_size 2048
    hidden_size: 2048
    depth: 28
    num_heads: 32
    cond_pos_embed_type: multimodal
  # For noise scheduler
  noise_scheduler:
    type: ddpm
    num_train_timesteps: 1000
    num_inference_timesteps: 5
    beta_schedule: squaredcos_cap_v2  # Critical choice
    prediction_type: sample
    clip_sample: false
  # For EMA (params averaging)
  # We do not use EMA currently
  ema:
    update_after_step: 0
    inv_gamma: 1.0
    power: 0.75
    min_value: 0.0
    max_value: 0.9999